Install devtools, bigWig, DESeq2 and dplyr (latticeExtra is installed further below)
# Install the packages used by this analysis.
# devtools is only needed to install bigWig, which lives on GitHub.
install.packages("devtools", quiet = TRUE)
library(devtools)
devtools::install_github("andrelmartins/bigWig", subdir = "bigWig")
library(bigWig)
# DESeq2 is distributed through Bioconductor rather than CRAN, so
# install.packages("DESeq2") cannot find it; go through BiocManager instead.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", quiet = TRUE)
}
BiocManager::install("DESeq2")
install.packages("dplyr", quiet = TRUE)
Install bedtools
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
brew install bedtools
Install Biostrings
# Install Biostrings from Bioconductor only when it is not available yet,
# bootstrapping BiocManager first if that is missing as well.
if (!requireNamespace("Biostrings", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("Biostrings")
}
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
## CRAN: https://cran.rstudio.com
## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)
## Installing package(s) 'Biostrings'
## also installing the dependencies 'bitops', 'zlibbioc', 'RCurl', 'GenomeInfoDbData', 'BiocGenerics', 'S4Vectors', 'IRanges', 'XVector', 'GenomeInfoDb', 'crayon'
##
## The downloaded binary packages are in
## /tmp/RtmpAEQ7p6/downloaded_packages
## installing the source package 'GenomeInfoDbData'
## Old packages: 'boot'
Install latticeExtra
install.packages("latticeExtra", quiet = TRUE)
## also installing the dependencies 'Rcpp', 'deldir', 'RcppEigen', 'png', 'jpeg', 'RColorBrewer', 'interp'
In the previous analysis, I ran the RSAT-dyad analysis, using the ENCODE DHS regions as a control for calculating the expected 3mer dyad occurrence, and generated these files with dyad patterns and the corresponding statistics.
Command that I used:
rsat dyad-analysis -o GATA3_peak_161win_with_motif_1_RSAT_dyad.txt -i GATA3_peak_161win_with_motif_1.fasta -format FastA -l 3 -sp 0-20 -expfreq ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna
# -1str single strand count; only the direct strand is considered for oligonucleotide and dyad occurrence counting.
# -2str count on both strands
#The occurrences of each oligonucleotide are summed on both strands. This allows to detect elements which act in an orientation-insensitive way (as is generally the case for yeast upstream elements).
# -type dyad_type (dr|ir|any) any (default)
#In order to fasten execution, the program can be asked to restrict its analysis to symmetric dyads.
#Three types are accepted
#dr direct repeats: the second element is the same as the first one
#ir inverted repeats: the second element is the reverse complement of the first one.
#rep repeats: direct and inverted repeats are evaluated
#any (default)
#When selecting the option any, the analysis is performed on all non-symmetric dyads as well.
Refer to the help menu.
cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/RSAT/
source ~/miniconda3/bin/activate
conda activate rsat
#rsat --help
rsat oligo-analysis -h
rsat create-background-model -h
rsat dyad-analysis -h
Files generated:
GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
GATA3_peak_161win_with_motif_2_RSAT_dyad.txt
GATA3_peak_161win_with_motif_4_RSAT_dyad.txt
GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
GATA3_peak_161win_with_motif_6_RSAT_dyad.txt
head -6 GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
#sequence identifier expected_freq occ exp_occ ovl_occall_occ ratio
#gatn{3}atc gatn{3}atc|gatn{3}atc 0.0000716460535 12482 85.98 36 12518 145.17
#agan{4}atc agan{4}atc|gatn{4}tct 0.0003298219842 10904 390.07 4445 15349 27.95
#atan{2}atc atan{2}atc|gatn{2}tat 0.0001616794646 10453 196.05 1460 11913 53.32
#gatn{4}tca gatn{4}tca|tgan{4}atc 0.0003742452755 7872 442.61 1728 9600 17.79
#agan{3}tat agan{3}tat|atan{3}tct 0.0002979478048 7425 357.56 613 8038 20.77
The first column is the sequence pattern that the RSAT dyad analysis found. Note that n{x} is the spacing between the two elements of the dyad, i.e. the number of bases between the end of the first element and the start of the second one. This differs from what we call the “relative distance”, which is anchored at the G of each of the two 3-mers. We will convert between the two while processing the data.
The second column lists all identifiers for that specific pattern. Note that while running the dyad analysis, we specified the parameter -1str to count only the direct strand. Thus, only the main structure and its reverse complement (for both 3-mers) are uniquely listed in the result files. This is convenient for downstream analysis.
The final column represents the ratio of observed occurrences to expected occurrences (occ/exp_occ), which can serve as a rough “enrichment score”. I plan to extract patterns linked to GAT/ATC and create a bar chart or xyplot to visually depict their frequencies.
Notice that using this ratio may overestimate some patterns
awk -F'\t' '$1 ~ /^gat.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | wc -l
#21
awk -F'\t' '$1 ~ /^atc.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | wc -l
#21
In the RSAT dyad analysis, gat-atc and atc-gat are two different structures.
awk -F'\t' '$1 ~ /^gat.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
echo " "
awk -F'\t' '$1 ~ /^atc.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
## gatn{3}atc gatn{3}atc|gatn{3}atc 0.0000716460535 12482 85.98 36 12518 145.17
## gatn{0}atc gatn{0}atc|gatn{0}atc 0.0000515638671 614 64.15 0 614 9.57
## gatn{4}atc gatn{4}atc|gatn{4}atc 0.0000504302144 545 59.64 6 551 9.14
## gatn{16}atc gatn{16}atc|gatn{16}atc 0.0000944795637 590 97.70 9 599 6.04
## gatn{15}atc gatn{15}atc|gatn{15}atc 0.0000818438092 516 85.70 8 524 6.02
## gatn{20}atc gatn{20}atc|gatn{20}atc 0.0000942591829 547 92.71 5 552 5.90
## gatn{10}atc gatn{10}atc|gatn{10}atc 0.0000987860472 604 109.68 6 610 5.51
## gatn{17}atc gatn{17}atc|gatn{17}atc 0.0000906426659 503 92.61 5 508 5.43
## gatn{8}atc gatn{8}atc|gatn{8}atc 0.0000899257860 544 102.07 7 551 5.33
## gatn{13}atc gatn{13}atc|gatn{13}atc 0.0000906646305 516 97.24 6 522 5.31
## gatn{14}atc gatn{14}atc|gatn{14}atc 0.0000916101649 510 97.11 9 519 5.25
## gatn{6}atc gatn{6}atc|gatn{6}atc 0.0000971154028 591 112.60 4 595 5.25
## gatn{11}atc gatn{11}atc|gatn{11}atc 0.0000932630475 533 102.39 9 542 5.21
## gatn{12}atc gatn{12}atc|gatn{12}atc 0.0000888395037 493 96.41 6 499 5.11
## gatn{9}atc gatn{9}atc|gatn{9}atc 0.0000902432624 515 101.37 12 527 5.08
## gatn{18}atc gatn{18}atc|gatn{18}atc 0.0000939778165 466 94.81 7 473 4.92
## gatn{19}atc gatn{19}atc|gatn{19}atc 0.0000965793592 437 96.22 4 441 4.54
## gatn{7}atc gatn{7}atc|gatn{7}atc 0.0000847463572 423 97.33 5 428 4.35
## gatn{1}atc gatn{1}atc|gatn{1}atc 0.0000635633129 315 78.20 1 316 4.03
## gatn{5}atc gatn{5}atc|gatn{5}atc 0.0000883000715 258 103.60 2 260 2.49
## gatn{2}atc gatn{2}atc|gatn{2}atc 0.0000878513418 209 106.53 0 209 1.96
##
## atcn{1}gat atcn{1}gat|atcn{1}gat 0.0000365557250 915 44.97 6 921 20.35
## atcn{13}gat atcn{13}gat|atcn{13}gat 0.0000915682647 986 98.21 3 989 10.04
## atcn{19}gat atcn{19}gat|atcn{19}gat 0.0000783118818 524 78.02 10 534 6.72
## atcn{8}gat atcn{8}gat|atcn{8}gat 0.0000890597270 659 101.08 10 669 6.52
## atcn{17}gat atcn{17}gat|atcn{17}gat 0.0000858063102 546 87.67 8 554 6.23
## atcn{6}gat atcn{6}gat|atcn{6}gat 0.0000802195944 562 93.01 4 566 6.04
## atcn{3}gat atcn{3}gat|atcn{3}gat 0.0000953432975 686 114.42 1 687 6.00
## atcn{16}gat atcn{16}gat|atcn{16}gat 0.0000842739153 521 87.15 3 524 5.98
## atcn{11}gat atcn{11}gat|atcn{11}gat 0.0000917826817 581 100.76 7 588 5.77
## atcn{15}gat atcn{15}gat|atcn{15}gat 0.0000847558549 510 88.75 7 517 5.75
## atcn{2}gat atcn{2}gat|atcn{2}gat 0.0000822145577 559 99.69 0 559 5.61
## atcn{7}gat atcn{7}gat|atcn{7}gat 0.0000795928625 495 91.41 3 498 5.42
## atcn{9}gat atcn{9}gat|atcn{9}gat 0.0000988309276 589 111.02 2 591 5.31
## atcn{20}gat atcn{20}gat|atcn{20}gat 0.0000944194877 481 92.87 5 486 5.18
## atcn{14}gat atcn{14}gat|atcn{14}gat 0.0000920659368 505 97.60 7 512 5.17
## atcn{10}gat atcn{10}gat|atcn{10}gat 0.0000804379701 458 89.30 3 461 5.13
## atcn{18}gat atcn{18}gat|atcn{18}gat 0.0000928758991 475 93.70 3 478 5.07
## atcn{12}gat atcn{12}gat|atcn{12}gat 0.0000918257056 501 99.65 2 503 5.03
## atcn{4}gat atcn{4}gat|atcn{4}gat 0.0000924786758 548 109.37 0 548 5.01
## atcn{5}gat atcn{5}gat|atcn{5}gat 0.0000961865213 534 112.85 2 536 4.73
## atcn{0}gat atcn{0}gat|atcn{0}gat 0.0000150225440 44 18.69 0 44 2.35
Notice that:
1) The x value of n{x} (the spacing) in RSAT is defined differently than in the customized analysis. For instance, gatn{3}atc corresponds to what we define in the customized analysis as an 8 bp relative distance between the G in gat and the C in atc.
2) Notice the second column–“identifier”.
On the other hand, gat-gat and atc-atc denote the same structure.
awk -F'\t' '$1 ~ /^gat.*gat/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
awk -F'\t' '$1 ~ /^atc.*atc/' GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | sort -k8,8nr
## atcn{2}atc atcn{2}atc|gatn{2}gat 0.0001743278582 3755 211.39 179 3934 17.76
## atcn{19}atc atcn{19}atc|gatn{19}gat 0.0001834690131 1423 182.79 61 1484 7.78
## atcn{7}atc atcn{7}atc|gatn{7}gat 0.0001966917141 1525 225.90 90 1615 6.75
## atcn{10}atc atcn{10}atc|gatn{10}gat 0.0002081405868 1541 231.08 90 1631 6.67
## atcn{9}atc atcn{9}atc|gatn{9}gat 0.0002062495205 1427 231.68 145 1572 6.16
## atcn{5}atc atcn{5}atc|gatn{5}gat 0.0001401253129 970 164.40 81 1051 5.90
## atcn{12}atc atcn{12}atc|gatn{12}gat 0.0001884293340 1200 204.49 69 1269 5.87
## atcn{20}atc atcn{20}atc|gatn{20}gat 0.0001875565375 1068 184.47 62 1130 5.79
## atcn{8}atc atcn{8}atc|gatn{8}gat 0.0001814393467 1159 205.94 53 1212 5.63
## atcn{15}atc atcn{15}atc|gatn{15}gat 0.0002001648218 1151 209.60 86 1237 5.49
## atcn{16}atc atcn{16}atc|gatn{16}gat 0.0001767432755 998 182.77 59 1057 5.46
## atcn{14}atc atcn{14}atc|gatn{14}gat 0.0001838280257 1035 194.87 73 1108 5.31
## atcn{13}atc atcn{13}atc|gatn{13}gat 0.0001850944034 1024 198.52 97 1121 5.16
## atcn{4}atc atcn{4}atc|gatn{4}gat 0.0001842588721 1102 217.92 36 1138 5.06
## atcn{11}atc atcn{11}atc|gatn{11}gat 0.0001856378756 1013 203.80 60 1073 4.97
## atcn{18}atc atcn{18}atc|gatn{18}gat 0.0001950393881 960 196.77 52 1012 4.88
## atcn{17}atc atcn{17}atc|gatn{17}gat 0.0001901779857 942 194.30 111 1053 4.85
## atcn{6}atc atcn{6}atc|gatn{6}gat 0.0001786128314 991 207.10 39 1030 4.79
## atcn{0}atc atcn{0}atc|gatn{0}gat 0.0003333380702 1643 414.73 41 1684 3.96
## atcn{3}atc atcn{3}atc|gatn{3}gat 0.0001916566577 851 230.00 59 910 3.70
## atcn{1}atc atcn{1}atc|gatn{1}gat 0.0001617727236 690 199.01 53 743 3.47
The input .fasta file has 12470 sequences, while the output .txt file contains 43680 patterns.
Why is the number of output patterns so much larger than the number of input sequences?
If we only input one sequence, what will the output look like?
source ~/miniconda3/bin/activate
conda activate rsat
cat test_single_input.fasta
#>chr10:100072622-100072723
#CAGATTTTATCATTTATTTGCTCATGTATTCACTCACTCATTAGGTCatctatttagtcaaccaacatttacttaagtccttctctattcagagctctcag
rsat dyad-analysis -o test_single_input_RSAT_dyad.txt -i test_single_input.fasta -format FastA -l 3 -sp 0-20 -expfreq ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna
For this single sequence input, it generates 25662 patterns.
If we look for the cat-tca pattern, RSAT identifies several cat-tca 3-mer pairs with different spacings. This differs from our customized analysis, in which, for each provided sequence, we only report the one occurrence of the pattern that is closest to the peak summit. So if I am looking for the cat-tca pattern there, it will give me only one result per sequence.
#head test_single_input_RSAT_dyad.txt
awk -F'\t' '$1 ~ /^cat.*tca/' test_single_input_RSAT_dyad.txt | sort -k8,8nr
## catn{20}tca catn{20}tca|tgan{20}atg 0.0005782191715 2 0.04 1 3 48.04
## catn{16}tca catn{16}tca|tgan{16}atg 0.0005603828788 2 0.04 0 2 47.59
## catn{8}tca catn{8}tca|tgan{8}atg 0.0005818472604 2 0.05 1 3 41.92
## catn{4}tca catn{4}tca|tgan{4}atg 0.0005971999077 2 0.05 0 2 36.40
## catn{12}tca catn{12}tca|tgan{12}atg 0.0005693193744 1 0.05 0 1 20.91
## catn{0}tca catn{0}tca|tgan{0}atg 0.0006561197581 0 0.06 0 0 0.00
## catn{10}tca catn{10}tca|tgan{10}atg 0.0005698178829 0 0.05 0 0 0.00
## catn{11}tca catn{11}tca|tgan{11}atg 0.0005810435898 0 0.05 0 0 0.00
## catn{13}tca catn{13}tca|tgan{13}atg 0.0005703437801 0 0.05 0 0 0.00
## catn{14}tca catn{14}tca|tgan{14}atg 0.0006183306318 0 0.05 0 0 0.00
## catn{15}tca catn{15}tca|tgan{15}atg 0.0005917583284 0 0.04 0 0 0.00
## catn{17}tca catn{17}tca|tgan{17}atg 0.0005702219341 0 0.04 0 0 0.00
## catn{18}tca catn{18}tca|tgan{18}atg 0.0005375782971 0 0.04 0 0 0.00
## catn{19}tca catn{19}tca|tgan{19}atg 0.0005756638123 0 0.04 0 0 0.00
## catn{1}tca catn{1}tca|tgan{1}atg 0.0006382247667 0 0.06 0 0 0.00
## catn{2}tca catn{2}tca|tgan{2}atg 0.0005706900152 0 0.05 0 0 0.00
## catn{3}tca catn{3}tca|tgan{3}atg 0.0007944812861 0 0.07 0 0 0.00
## catn{5}tca catn{5}tca|tgan{5}atg 0.0005682468718 0 0.05 0 0 0.00
## catn{6}tca catn{6}tca|tgan{6}atg 0.0005886556431 0 0.05 0 0 0.00
## catn{7}tca catn{7}tca|tgan{7}atg 0.0006022431158 0 0.05 0 0 0.00
## catn{9}tca catn{9}tca|tgan{9}atg 0.0005766835572 0 0.05 0 0 0.00
library(Biostrings)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
#' Extract one dyad family from an RSAT dyad-analysis result table.
#'
#' Reads a tab-separated RSAT dyad-analysis output file, splits the dyad in
#' column 1 (e.g. "gatn{3}atc") into its two elements and the spacing, and
#' keeps the rows whose elements match the requested patterns.
#'
#' @param file_path Path to the RSAT result file. It is read without a header;
#'   column 1 (V1) holds the dyad and column 8 (V8) the obs/exp ratio.
#' @param pattern1 DNA string matched (case-insensitively) against the first
#'   element of the dyad.
#' @param pattern2 DNA string matched (case-insensitively) against the second
#'   element of the dyad.
#' @param number Offset added to the RSAT spacing to obtain
#'   `relative_distance`.
#' @return A data frame with columns `first`, `dyad_distance`, `second`,
#'   `ratio` and `relative_distance`, ordered by decreasing `ratio`.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  input_data <- read.table(file_path, header = FALSE, sep = "\t")

  # "gatn{3}atc" is split into c("gat", "3", "atc").
  components_list <- strsplit(as.character(input_data$V1), "n\\{|\\}")
  # vapply() guarantees a character vector whatever the input looks like.
  pick <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }
  processed_data <- data.frame(
    first = pick(1),
    dyad_distance = as.numeric(pick(2)),
    second = pick(3),
    ratio = input_data$V8
  )

  # DNAString() validates that the patterns are DNA.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)

  # Row indices whose first/second elements contain the given patterns.
  matching_rows <- function(p_first, p_second) {
    which(
      grepl(as.character(p_first), processed_data$first, ignore.case = TRUE) &
        grepl(as.character(p_second), processed_data$second, ignore.case = TRUE)
    )
  }

  keep <- matching_rows(pattern1, pattern2)
  if (pattern1 == pattern2) {
    # A dyad made of the same element twice may be listed under its
    # reverse-complement form (e.g. atcn{2}atc|gatn{2}gat), so look for that
    # form as well. unique() prevents rows from being returned twice when the
    # pattern is its own reverse complement.
    rc_rows <- matching_rows(
      reverseComplement(pattern1),
      reverseComplement(pattern2)
    )
    keep <- unique(c(keep, rc_rows))
  }
  output_data <- processed_data[keep, ]

  # Highest enrichment first.
  output_data <- output_data[order(-output_data$ratio), ]
  output_data$relative_distance <- output_data$dyad_distance + number
  output_data
}
# Same-strand dyads (GAT ... GAT): G-to-G distance = spacing + 3.
GAT_GAT <- process_and_subset_RSAT(
  file_path = "./GATA3_peak_161win_with_motif_1_RSAT_dyad.txt",
  pattern1 = "GAT",
  pattern2 = "GAT",
  number = 3
)
# Opposite-strand dyads (GAT ... ATC): G-to-C distance = spacing + 5.
GAT_ATC <- process_and_subset_RSAT(
  file_path = "./GATA3_peak_161win_with_motif_1_RSAT_dyad.txt",
  pattern1 = "GAT",
  pattern2 = "ATC",
  number = 5
)
nrow(GAT_GAT)
## [1] 21
head(GAT_GAT)
## first dyad_distance second ratio relative_distance
## 28 atc 2 atc 17.76 5
## 1603 atc 19 atc 7.78 22
## 1264 atc 7 atc 6.75 10
## 1219 atc 10 atc 6.67 13
## 1583 atc 9 atc 6.16 12
## 5986 atc 5 atc 5.90 8
nrow(GAT_ATC)
## [1] 21
head(GAT_ATC)
## first dyad_distance second ratio relative_distance
## 1 gat 3 atc 145.17 8
## 15239 gat 0 atc 9.57 5
## 18183 gat 4 atc 9.14 9
## 16303 gat 16 atc 6.04 21
## 19417 gat 15 atc 6.02 20
## 18091 gat 20 atc 5.90 25
GAT_GAT will be the orange trace, since the second GAT is on the same strand as the first GAT; similarly, GAT_ATC will be the dark-green trace, as the second element (ATC) is on the opposite strand relative to the first GAT.
# Tag each table with its strand relationship, then stack them for plotting.
GAT_GAT$query_status <- "same_strand_GAT"
GAT_ATC$query_status <- "opposite_strand_GAT"
df.plot <- rbind(GAT_GAT, GAT_ATC)
str(df.plot)
## 'data.frame': 42 obs. of 6 variables:
## $ first : chr "atc" "atc" "atc" "atc" ...
## $ dyad_distance : num 2 19 7 10 9 5 12 20 8 15 ...
## $ second : chr "atc" "atc" "atc" "atc" ...
## $ ratio : num 17.76 7.78 6.75 6.67 6.16 ...
## $ relative_distance: num 5 22 10 13 12 8 15 23 11 18 ...
## $ query_status : chr "same_strand_GAT" "same_strand_GAT" "same_strand_GAT" "same_strand_GAT" ...
unique(df.plot$query_status)
## [1] "same_strand_GAT" "opposite_strand_GAT"
#[1] "same_strand_GAT" "opposite_strand_GAT"
head(df.plot)
## first dyad_distance second ratio relative_distance query_status
## 28 atc 2 atc 17.76 5 same_strand_GAT
## 1603 atc 19 atc 7.78 22 same_strand_GAT
## 1264 atc 7 atc 6.75 10 same_strand_GAT
## 1219 atc 10 atc 6.67 13 same_strand_GAT
## 1583 atc 9 atc 6.16 12 same_strand_GAT
## 5986 atc 5 atc 5.90 8 same_strand_GAT
df.plot$query_status = factor(df.plot$query_status, levels = c("same_strand_GAT", "opposite_strand_GAT"))
nrow(df.plot)
## [1] 42
#42
summary(df.plot)
## first dyad_distance second ratio
## Length:42 Min. : 0 Length:42 Min. : 1.960
## Class :character 1st Qu.: 5 Class :character 1st Qu.: 4.890
## Mode :character Median :10 Mode :character Median : 5.310
## Mean :10 Mean : 8.977
## 3rd Qu.:15 3rd Qu.: 5.900
## Max. :20 Max. :145.170
## relative_distance query_status
## Min. : 3 same_strand_GAT :21
## 1st Qu.: 9 opposite_strand_GAT:21
## Median :14
## Mean :14
## 3rd Qu.:19
## Max. :25
xyplot
library(lattice)
library(latticeExtra)
# Scatter plot of the RSAT obs/exp ratio against the relative distance,
# with one colour per level of query_status.
xyplot(
  ratio ~ relative_distance,
  groups = query_status,
  data = df.plot,
  main = "GATA3 peak with motif1",
  xlab = "distance (bp) from 2nd closest GAT to closest GAT",
  ylab = "RSAT obs/exp Ratio",
  xlim = c(0, 30),
  ylim = c(0, 200),
  aspect = 1,
  between = list(y = 1.0),
  auto.key = list(space = "right", points = TRUE),
  scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
  par.settings = list(
    superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
    strip.background = list(col = "grey85")
  ),
  panel = function(x, y, ...) {
    # Light vertical guide lines at 1..10 bp, drawn before the points.
    panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
    panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
  }
)
Loop through the RSAT results for the other positive controls, extract the relevant dyad patterns and make an xyplot/bar chart for each:
library(Biostrings)
library(lattice)
library(latticeExtra)
#function
#' Extract one dyad family from an RSAT dyad-analysis result table.
#'
#' Reads a tab-separated RSAT dyad-analysis output file, splits the dyad in
#' column 1 (e.g. "gatn{3}atc") into its two elements and the spacing, and
#' keeps the rows whose elements match the requested patterns.
#'
#' @param file_path Path to the RSAT result file. It is read without a header;
#'   column 1 (V1) holds the dyad and column 8 (V8) the obs/exp ratio.
#' @param pattern1 DNA string matched (case-insensitively) against the first
#'   element of the dyad.
#' @param pattern2 DNA string matched (case-insensitively) against the second
#'   element of the dyad.
#' @param number Offset added to the RSAT spacing to obtain
#'   `relative_distance`.
#' @return A data frame with columns `first`, `dyad_distance`, `second`,
#'   `ratio` and `relative_distance`, ordered by decreasing `ratio`.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  input_data <- read.table(file_path, header = FALSE, sep = "\t")

  # "gatn{3}atc" is split into c("gat", "3", "atc").
  components_list <- strsplit(as.character(input_data$V1), "n\\{|\\}")
  # vapply() guarantees a character vector whatever the input looks like.
  pick <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }
  processed_data <- data.frame(
    first = pick(1),
    dyad_distance = as.numeric(pick(2)),
    second = pick(3),
    ratio = input_data$V8
  )

  # DNAString() validates that the patterns are DNA.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)

  # Row indices whose first/second elements contain the given patterns.
  matching_rows <- function(p_first, p_second) {
    which(
      grepl(as.character(p_first), processed_data$first, ignore.case = TRUE) &
        grepl(as.character(p_second), processed_data$second, ignore.case = TRUE)
    )
  }

  keep <- matching_rows(pattern1, pattern2)
  if (pattern1 == pattern2) {
    # A dyad made of the same element twice may be listed under its
    # reverse-complement form (e.g. atcn{2}atc|gatn{2}gat), so look for that
    # form as well. unique() prevents rows from being returned twice when the
    # pattern is its own reverse complement.
    rc_rows <- matching_rows(
      reverseComplement(pattern1),
      reverseComplement(pattern2)
    )
    keep <- unique(c(keep, rc_rows))
  }
  output_data <- processed_data[keep, ]

  # Highest enrichment first.
  output_data <- output_data[order(-output_data$ratio), ]
  output_data$relative_distance <- output_data$dyad_distance + number
  output_data
}
#loop through files
# For every per-motif RSAT dyad result file, extract the GAT-GAT and GAT-ATC
# dyads and write one xyplot PDF per motif.
for (dyad.results in Sys.glob(file.path("./GATA3_peak_161win_with_motif_*_RSAT_dyad.txt"))) {
  print(dyad.results)
  # File names look like GATA3_peak_161win_with_motif_<id>_RSAT_dyad.txt;
  # pull out <id> and build e.g. "motif1".
  motif.name <- paste0(
    "motif",
    sub(
      "^GATA3_peak_161win_with_motif_(.*)_RSAT_dyad\\.txt$",
      "\\1",
      basename(dyad.results)
    )
  )
  print(motif.name)

  GAT_GAT <- process_and_subset_RSAT(dyad.results, "GAT", "GAT", 3)
  GAT_ATC <- process_and_subset_RSAT(dyad.results, "GAT", "ATC", 5)
  GAT_GAT$query_status <- "same_strand_GAT"
  GAT_ATC$query_status <- "opposite_strand_GAT"
  df.plot <- rbind(GAT_GAT, GAT_ATC)
  # The bare nrow()/head()/unique()/summary() calls of the interactive version
  # are dropped: inside a for loop their values are not auto-printed, so they
  # had no effect. str() prints explicitly and is kept.
  str(df.plot)
  df.plot$query_status <- factor(
    df.plot$query_status,
    levels = c("same_strand_GAT", "opposite_strand_GAT")
  )

  # Keep the common 0-200 axis when the data fit, but widen it when a motif
  # has a larger ratio so that its top point is not silently clipped.
  y.max <- max(c(200, 1.1 * df.plot$ratio), na.rm = TRUE)

  pdf(paste0('xy_RSAT_dyad_closest_2nd_GAT_to_closest_1st_GAT_GATA3_peak_with_', motif.name, '.pdf'), width = 15, height = 5)
  # finally = dev.off() closes the PDF device even if plotting fails, so a
  # failure on one file does not leave a dangling device for the next ones.
  tryCatch(
    print(
      xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(0, 30),
        ylim = c(0, y.max),
        xlab = "distance (bp) from 2nd closest GAT to closest GAT",
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif.name),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = function(x, y, ...) {
          # Light vertical guide lines at 1..10 bp, drawn before the points.
          panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
          panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
        }
      )
    ),
    finally = dev.off()
  )
}
In the previous analysis, we extracted dyad structures anchored at “GAT.” However, we have discussed the sensitivity of RSAT analysis to structures with two reverse-complement dyads. For motif5 analysis, we aim to focus on structures anchored specifically at “ATC” to ensure we identify the correct structure.
library(Biostrings)
#' Extract one dyad family from an RSAT dyad-analysis result table.
#'
#' Reads a tab-separated RSAT dyad-analysis output file, splits the dyad in
#' column 1 (e.g. "gatn{3}atc") into its two elements and the spacing, and
#' keeps the rows whose elements match the requested patterns.
#'
#' @param file_path Path to the RSAT result file. It is read without a header;
#'   column 1 (V1) holds the dyad and column 8 (V8) the obs/exp ratio.
#' @param pattern1 DNA string matched (case-insensitively) against the first
#'   element of the dyad.
#' @param pattern2 DNA string matched (case-insensitively) against the second
#'   element of the dyad.
#' @param number Offset added to the RSAT spacing to obtain
#'   `relative_distance`.
#' @return A data frame with columns `first`, `dyad_distance`, `second`,
#'   `ratio` and `relative_distance`, ordered by decreasing `ratio`.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  input_data <- read.table(file_path, header = FALSE, sep = "\t")

  # "gatn{3}atc" is split into c("gat", "3", "atc").
  components_list <- strsplit(as.character(input_data$V1), "n\\{|\\}")
  # vapply() guarantees a character vector whatever the input looks like.
  pick <- function(i) {
    vapply(components_list, function(x) x[i], character(1))
  }
  processed_data <- data.frame(
    first = pick(1),
    dyad_distance = as.numeric(pick(2)),
    second = pick(3),
    ratio = input_data$V8
  )

  # DNAString() validates that the patterns are DNA.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)

  # Row indices whose first/second elements contain the given patterns.
  matching_rows <- function(p_first, p_second) {
    which(
      grepl(as.character(p_first), processed_data$first, ignore.case = TRUE) &
        grepl(as.character(p_second), processed_data$second, ignore.case = TRUE)
    )
  }

  keep <- matching_rows(pattern1, pattern2)
  if (pattern1 == pattern2) {
    # A dyad made of the same element twice may be listed under its
    # reverse-complement form (e.g. atcn{2}atc|gatn{2}gat), so look for that
    # form as well. unique() prevents rows from being returned twice when the
    # pattern is its own reverse complement.
    rc_rows <- matching_rows(
      reverseComplement(pattern1),
      reverseComplement(pattern2)
    )
    keep <- unique(c(keep, rc_rows))
  }
  output_data <- processed_data[keep, ]

  # Highest enrichment first.
  output_data <- output_data[order(-output_data$ratio), ]
  output_data$relative_distance <- output_data$dyad_distance + number
  output_data
}
# Opposite-strand dyads (ATC ... GAT): C-to-G distance = spacing + 1.
ATC_GAT <- process_and_subset_RSAT(
  file_path = "./GATA3_peak_161win_with_motif_5_RSAT_dyad.txt",
  pattern1 = "ATC",
  pattern2 = "GAT",
  number = 1
)
# Same-strand dyads (ATC ... ATC): C-to-C distance = spacing + 3.
ATC_ATC <- process_and_subset_RSAT(
  file_path = "./GATA3_peak_161win_with_motif_5_RSAT_dyad.txt",
  pattern1 = "ATC",
  pattern2 = "ATC",
  number = 3
)
nrow(ATC_GAT)
## [1] 21
head(ATC_GAT)
## first dyad_distance second ratio relative_distance
## 3 atc 1 gat 271.11 2
## 7076 atc 8 gat 8.94 9
## 11672 atc 10 gat 8.03 11
## 12399 atc 6 gat 7.49 7
## 14492 atc 7 gat 6.98 8
## 12923 atc 14 gat 6.96 15
nrow(ATC_ATC)
## [1] 21
head(ATC_ATC)
## first dyad_distance second ratio relative_distance
## 14 atc 2 atc 21.52 5
## 342 atc 9 atc 8.47 12
## 849 atc 6 atc 7.87 9
## 855 atc 10 atc 7.04 13
## 1116 atc 4 atc 6.87 7
## 2937 atc 11 atc 5.81 14
# Tag each table with its strand relationship, then stack them for plotting.
ATC_ATC$query_status <- "same_strand_ATC"
ATC_GAT$query_status <- "opposite_strand_ATC"
df.plot <- rbind(ATC_ATC, ATC_GAT)
str(df.plot)
## 'data.frame': 42 obs. of 6 variables:
## $ first : chr "atc" "atc" "atc" "atc" ...
## $ dyad_distance : num 2 9 6 10 4 11 7 12 14 16 ...
## $ second : chr "atc" "atc" "atc" "atc" ...
## $ ratio : num 21.52 8.47 7.87 7.04 6.87 ...
## $ relative_distance: num 5 12 9 13 7 14 10 15 17 19 ...
## $ query_status : chr "same_strand_ATC" "same_strand_ATC" "same_strand_ATC" "same_strand_ATC" ...
unique(df.plot$query_status)
## [1] "same_strand_ATC" "opposite_strand_ATC"
head(df.plot)
## first dyad_distance second ratio relative_distance query_status
## 14 atc 2 atc 21.52 5 same_strand_ATC
## 342 atc 9 atc 8.47 12 same_strand_ATC
## 849 atc 6 atc 7.87 9 same_strand_ATC
## 855 atc 10 atc 7.04 13 same_strand_ATC
## 1116 atc 4 atc 6.87 7 same_strand_ATC
## 2937 atc 11 atc 5.81 14 same_strand_ATC
df.plot$query_status = factor(df.plot$query_status, levels = c("same_strand_ATC", "opposite_strand_ATC"))
nrow(df.plot)
## [1] 42
#42
summary(df.plot)
## first dyad_distance second ratio
## Length:42 Min. : 0 Length:42 Min. : 1.850
## Class :character 1st Qu.: 5 Class :character 1st Qu.: 4.540
## Mode :character Median :10 Mode :character Median : 5.520
## Mean :10 Mean : 12.089
## 3rd Qu.:15 3rd Qu.: 6.845
## Max. :20 Max. :271.110
## relative_distance query_status
## Min. : 1 same_strand_ATC :21
## 1st Qu.: 7 opposite_strand_ATC:21
## Median :12
## Mean :12
## 3rd Qu.:17
## Max. :23
xyplot
library(lattice)
library(latticeExtra)
#pdf(paste0('xy_RSAT_dyad_closest_2nd_ATC_to_closest_1st_ATC_GATA3_peak_with_motif5.pdf'), width=10,height=5)
#print(
# Scatter plot of the RSAT obs/exp ratio against the relative distance for the
# ATC-anchored motif5 dyads, with one colour per level of query_status.
xyplot(
  ratio ~ relative_distance,
  groups = query_status,
  data = df.plot,
  main = "GATA3 peak with motif5",
  xlab = "distance (bp) from 2nd closest ATC to closest ATC",
  ylab = "RSAT obs/exp Ratio",
  xlim = c(0, 30),
  ylim = c(0, 300),
  aspect = 1,
  between = list(y = 1.0),
  auto.key = list(space = "right", points = TRUE),
  scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
  par.settings = list(
    superpose.symbol = list(col = c("#8B4513", "#145A8C"), pch = 18, lwd = 2),
    strip.background = list(col = "grey85")
  ),
  panel = function(x, y, ...) {
    # Light vertical guide lines at 1..10 bp, drawn before the points.
    panel.abline(v = c(seq(1, 10, 1)), col = "grey90")
    panel.xyplot(x, y, col = c("#8B4513", "#145A8C"), pch = 18, cex = 0.6, ...)
  }
)
#)
#dev.off()
For the customized analysis:
Merge the occurrences of the anchored GAT on both the plus and minus strands.
Determine the second closest GAT relative to the strandedness of the anchored GAT. For example, if anchoring at GAT (regardless of its strand), the “minus-GAT” trace should be the sum of the occurrences of “+GAT” anchored at -GAT and “-GAT” anchored at +GAT.
Calculate the relative frequencies (y-axis in plots) as the actual occurrences of the pattern minus the frequencies from the DHS negative control.
# define function
#' Relative frequency of each distinct value in a vector.
#'
#' @param data Vector of values to tabulate.
#' @return A data frame with one row per distinct value: `value` (the value,
#'   as character) and `actual_freq` (its count divided by `length(data)`).
calculate_actual_frequency <- function(data) {
  # table() drops NA by default while length() still counts them, so the
  # frequencies are fractions of all records, not only of the non-missing ones.
  actual_frequencies <- table(data) / length(data)
  data.frame(
    # as.character() keeps the `value` column (with zero rows) even when
    # `data` is empty, so a later merge on "value" still finds its key.
    value = as.character(names(actual_frequencies)),
    actual_freq = as.vector(actual_frequencies),
    stringsAsFactors = FALSE
  )
}
# Script 1: for every motif, compare how often the 2nd-closest GAT occurs at
# each absolute distance from the closest GAT in GATA3 peaks versus DHS
# regions. Frequencies are computed separately for each input file (one file
# per anchor/query strand combination), then plotted.
my_motifs = c("motif1", "motif4","motif2", "motif6", "motif5")
for (motif in my_motifs) {
print(motif)
#GATA peaks
# Empty accumulator; rows from every matching GATA3 file are appended below.
df.plot.GATA = data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.plot.GATA) = c("dis","anchor_status", "query_status","abs.dis", "actual_freq")
# The glob matches one file per (query, anchor) combination for this motif.
for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd*GAT.to.1st*GAT.GATA3.", motif, ".bed")))) {
print(closest_2nd_dis)
# Anchor label = part of the file name after "to.1st." and before
# ".GATA3.<motif>.bed".
anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), paste0(".GATA3.", motif, ".bed"))[[1]][1]
print(anchor_status)
# Query label = part of the file name after "closest.2nd." and before ".to.1st.".
query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), paste0(".to.1st.*.GATA3.", motif, ".bed"))[[1]][1]
print(query_status)
# Column 11 of the file is stored as `dis`. cbind() with the two text labels
# produces a character matrix, hence the as.integer() conversion below.
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status))
colnames(temp) = c("dis", "anchor_status", "query_status")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
# Per-file frequency of each absolute distance, attached to every row.
actual_frequencies = calculate_actual_frequency(temp$abs.dis)
temp1= merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
df.plot.GATA = rbind(df.plot.GATA,temp1)
}
# Labels other than "plus.GAT"/"minus.GAT" become NA in these factors.
df.plot.GATA$anchor_status = factor(df.plot.GATA$anchor_status, levels = c("plus.GAT", "minus.GAT"))
df.plot.GATA$query_status = factor(df.plot.GATA$query_status, levels = c("plus.GAT", "minus.GAT"))
# Collapse the per-observation rows to the distinct combinations.
uniq.df.plot.GATA=df.plot.GATA[!duplicated(df.plot.GATA), ]
#DHS regions
# NOTE(review): nothing in this DHS section references `motif`, so the same
# table is rebuilt on every iteration of the outer loop.
df.plot.DHS = data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.plot.DHS) = c("dis","anchor_status", "query_status","abs.dis", "actual_freq_neg")
for (closest_2nd_dis in Sys.glob(file.path("./closest.2nd*GAT.to.1st*GAT.indep.DHS.bed"))) {
print(closest_2nd_dis)
# Same file-name parsing as above, with the ".indep.DHS.bed" suffix.
anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), ".indep.DHS.bed")[[1]][1]
print(anchor_status)
query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), ".to.1st.*.indep.DHS.bed")[[1]][1]
print(query_status)
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status))
colnames(temp) = c("dis", "anchor_status", "query_status")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
actual_frequency = calculate_actual_frequency(temp$abs.dis)
temp1= merge(temp, actual_frequency, by.x = "abs.dis", by.y = "value", all.x = TRUE)
df.plot.DHS = rbind(df.plot.DHS, temp1)
}
# The 5th column is the frequency column appended by merge(); rename it so it
# does not clash with the GATA3 `actual_freq` column in the merge below.
colnames(df.plot.DHS)[5]="actual_freq_DHS"
df.plot.DHS$anchor_status = factor(df.plot.DHS$anchor_status, levels = c("plus.GAT", "minus.GAT"))
df.plot.DHS$query_status = factor(df.plot.DHS$query_status, levels = c("plus.GAT", "minus.GAT"))
uniq.df.plot.DHS=df.plot.DHS[!duplicated(df.plot.DHS), ]
#nrow(uniq.df.plot.DHS) #[1] 2859
#calculate the relative frequency
#by subtraction of actual frequency between GATA3 peaks and DHS regions
# all.x = TRUE keeps GATA3 rows that have no DHS counterpart; their rel_freq
# is NA.
df.plot=merge(uniq.df.plot.GATA, uniq.df.plot.DHS, by=c("abs.dis", "dis", "anchor_status", "query_status"), all.x = TRUE)
df.plot$rel_freq <- ifelse(is.na(df.plot$actual_freq_DHS), NA, df.plot$actual_freq - df.plot$actual_freq_DHS)
# Same strand when anchor and query labels agree, opposite strand otherwise.
df.plot$strand_relationship <- ifelse(df.plot$anchor_status == df.plot$query_status,
"same_strand_GAT", "opposite_strand_GAT")
df.plot$strand_relationship = factor(df.plot$strand_relationship, levels = c("same_strand_GAT", "opposite_strand_GAT"))
library(lattice)
library(latticeExtra)
# PDF: a single panel, points grouped by strand relationship.
# NOTE(review): ylim starts at 0, so negative rel_freq values are not drawn.
pdf(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis,
data = df.plot,
groups = strand_relationship,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.3),
#type = c('p', 'smooth'),
xlab = "distance (bp) from 2nd closest GAT to closest GAT",
ylab="Frequency relative to DHS regions",
main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
panel = function(x,y,...) {panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","skyblue"), ...)
panel.xyplot(x, y,
col=c("red","blue"),
pch=18,
cex=0.6,...)
panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
})
)
dev.off()
# PNG: one panel per anchor strand, points grouped by query strand.
png(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
print(xyplot(rel_freq ~ abs.dis | anchor_status,
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = query_status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.3),
#type = c('p', 'smooth'),
xlab = "distance (bp) from 2nd closest GAT to closest GAT",
ylab="Frequency relative to DHS regions",
main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
panel = function(x,y,...) {panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","skyblue"), ...)
panel.xyplot(x, y,
col=c("red","blue"),
pch=18,
cex=0.6,...)
panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
})
)
dev.off()
}
Figure 1: xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure
The plot generated by the above code shows two points at the same distance for the same strand relationship. This is because plus.GAT-minus.GAT and minus.GAT-plus.GAT are both labeled “opposite_strand_GAT”, while their relative frequencies differ slightly (although they are very close).
See the plotting data frame below:
abs.dis dis anchor_status query_status actual_freq actual_freq_DHS
1 1 1 plus.GAT minus.GAT 0.0008025682 0.0008209464
2 1 1 minus.GAT plus.GAT 0.0009630046 0.0008557308
3 2 2 plus.GAT minus.GAT 0.0119582665 0.0027248432
4 2 2 minus.GAT plus.GAT 0.0127598106 0.0026021201
5 3 3 plus.GAT plus.GAT 0.0300938929 0.0162763486
6 3 3 plus.GAT minus.GAT 0.0760032103 0.1039283157
rel_freq strand_relationship
1 -1.837814e-05 opposite_strand_GAT
2 1.072738e-04 opposite_strand_GAT
3 9.233423e-03 opposite_strand_GAT
4 1.015769e-02 opposite_strand_GAT
5 1.381754e-02 same_strand_GAT
6 -2.792511e-02 opposite_strand_GAT
This is not ideal. So I will modify the code: instead of calculating the actual frequency for each file separately, I will combine the files, create the label describing the strand relationship, and then calculate the actual frequency.
# Compute the relative frequency of each distinct value in `data`.
#
# Args:
#   data: an atomic vector (the callers in this file pass integer absolute
#     distances).
# Returns:
#   A data.frame with one row per distinct non-NA value and two columns:
#   `value` (the value as text) and `actual_freq` (its count divided by
#   length(data)). Input with no countable values yields a zero-row
#   data.frame that still has both columns, so a later
#   merge(..., by.y = "value") keeps working.
calculate_actual_frequency <- function(data) {
  if (length(data) == 0L) {
    return(data.frame(value = character(0), actual_freq = numeric(0)))
  }
  # table() drops NA values while length() counts them, so the frequencies
  # sum to less than 1 when `data` contains NAs.
  actual_frequencies <- table(data) / length(data)
  # as.character() turns a NULL name vector (nothing tabulated) into
  # character(0) so the `value` column is never silently dropped.
  data.frame(
    value = as.character(names(actual_frequencies)),
    actual_freq = as.vector(actual_frequencies)
  )
}
# Script 2: same comparison as the previous script, but all files of a set
# are pooled first and the frequencies are computed per strand relationship
# (same vs. opposite strand) instead of per file.
my_motifs = c("motif1", "motif4","motif2", "motif6", "motif5")
for (motif in my_motifs) {
print(motif)
#GATA peaks
# Empty accumulator; rows from every matching GATA3 file are appended below.
df.plot.GATA = data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.plot.GATA) = c("dis","anchor_status", "query_status","abs.dis")
for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd*GAT.to.1st*GAT.GATA3.", motif, ".bed")))) {
print(closest_2nd_dis)
# Anchor label = part of the file name after "to.1st." and before
# ".GATA3.<motif>.bed".
anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), paste0(".GATA3.", motif, ".bed"))[[1]][1]
print(anchor_status)
# Query label = part of the file name after "closest.2nd." and before ".to.1st.".
query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), paste0(".to.1st.*.GATA3.", motif, ".bed"))[[1]][1]
print(query_status)
# Column 11 of the file is stored as `dis`. cbind() with the two text labels
# produces a character matrix, hence the as.integer() conversion below.
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status))
colnames(temp) = c("dis", "anchor_status", "query_status")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
df.plot.GATA = rbind(df.plot.GATA,temp)
}
df.plot.GATA$anchor_status = factor(df.plot.GATA$anchor_status, levels = c("plus.GAT", "minus.GAT"))
df.plot.GATA$query_status = factor(df.plot.GATA$query_status, levels = c("plus.GAT", "minus.GAT"))
# Same strand when anchor and query labels agree, opposite strand otherwise.
df.plot.GATA$strand_relationship <- ifelse(df.plot.GATA$anchor_status == df.plot.GATA$query_status,
"same_strand_GAT", "opposite_strand_GAT")
df.plot.GATA$strand_relationship = factor(df.plot.GATA$strand_relationship, levels = c("same_strand_GAT", "opposite_strand_GAT"))
# Frequencies are computed within each strand-relationship subset, so each
# subset's frequencies are relative to that subset's own row count.
temp.g1=df.plot.GATA[df.plot.GATA$strand_relationship=="same_strand_GAT",]
actual_frequency_same_strand = calculate_actual_frequency(temp.g1$abs.dis)
temp.g2=df.plot.GATA[df.plot.GATA$strand_relationship=="opposite_strand_GAT",]
actual_frequency_oppo_strand = calculate_actual_frequency(temp.g2$abs.dis)
df.plot.GATA1=rbind(merge(temp.g1, actual_frequency_same_strand, by.x = "abs.dis", by.y = "value", all.x = TRUE), merge(temp.g2, actual_frequency_oppo_strand, by.x = "abs.dis", by.y = "value", all.x = TRUE))
uniq.df.plot.GATA=df.plot.GATA1[!duplicated(df.plot.GATA1), ]
#DHS regions
# NOTE(review): nothing in this DHS section references `motif`, so the same
# table is rebuilt on every iteration of the outer loop.
df.plot.DHS = data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.plot.DHS) = c("dis","anchor_status", "query_status","abs.dis")
for (closest_2nd_dis in Sys.glob(file.path("./closest.2nd*GAT.to.1st*GAT.indep.DHS.bed"))) {
print(closest_2nd_dis)
# Same file-name parsing as above, with the ".indep.DHS.bed" suffix.
anchor_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.*.to.1st.')[[1]][2]), ".indep.DHS.bed")[[1]][1]
print(anchor_status)
query_status =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.')[[1]][2]), ".to.1st.*.indep.DHS.bed")[[1]][1]
print(query_status)
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_status, query_status))
colnames(temp) = c("dis", "anchor_status", "query_status")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
df.plot.DHS = rbind(df.plot.DHS, temp)
}
df.plot.DHS$anchor_status = factor(df.plot.DHS$anchor_status, levels = c("plus.GAT", "minus.GAT"))
df.plot.DHS$query_status = factor(df.plot.DHS$query_status, levels = c("plus.GAT", "minus.GAT"))
df.plot.DHS$strand_relationship <- ifelse(df.plot.DHS$anchor_status == df.plot.DHS$query_status,
"same_strand_GAT", "opposite_strand_GAT")
df.plot.DHS$strand_relationship = factor(df.plot.DHS$strand_relationship, levels = c("same_strand_GAT", "opposite_strand_GAT"))
temp.g1=df.plot.DHS[df.plot.DHS$strand_relationship=="same_strand_GAT",]
actual_frequency_same_strand = calculate_actual_frequency(temp.g1$abs.dis)
temp.g2=df.plot.DHS[df.plot.DHS$strand_relationship=="opposite_strand_GAT",]
actual_frequency_oppo_strand = calculate_actual_frequency(temp.g2$abs.dis)
df.plot.DHS1=rbind(merge(temp.g1, actual_frequency_same_strand, by.x = "abs.dis", by.y = "value", all.x = TRUE), merge(temp.g2, actual_frequency_oppo_strand, by.x = "abs.dis", by.y = "value", all.x = TRUE))
uniq.df.plot.DHS=df.plot.DHS1[!duplicated(df.plot.DHS1), ] #nrow(uniq.df.plot.DHS) #[1] 2859
# Column 6 is the `actual_freq` column appended by merge(); rename it so it
# does not clash with the GATA3 `actual_freq` column in the merge below.
colnames(uniq.df.plot.DHS)[6]="actual_freq_DHS"
#calculate the relative frequency
#by subtraction of actual frequency between GATA3 peaks and DHS regions
# all.x = TRUE keeps GATA3 rows that have no DHS counterpart; their rel_freq
# is NA.
df.plot1=merge(uniq.df.plot.GATA, uniq.df.plot.DHS, by=c("abs.dis", "dis", "anchor_status", "query_status", "strand_relationship"), all.x = TRUE)
df.plot1$rel_freq <- ifelse(is.na(df.plot1$actual_freq_DHS), NA, df.plot1$actual_freq - df.plot1$actual_freq_DHS)
# Positional selection keeps abs.dis, dis, strand_relationship, actual_freq,
# actual_freq_DHS and rel_freq (drops anchor_status and query_status), after
# which rows that only differed by anchor/query collapse into one.
df.plot=df.plot1[, c(1,2,5,6,7,8)] #1104
df.plot=df.plot[!duplicated(df.plot), ] #658
library(lattice)
library(latticeExtra)
# PDF and PNG below draw the same single-panel plot grouped by strand
# relationship.
# NOTE(review): ylim starts at 0, so negative rel_freq values are not drawn.
pdf(paste0('xy2_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis,
data = df.plot,
groups = strand_relationship,
auto.key=list(space="right", points=TRUE),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.3),
#type = c('p', 'smooth'),
xlab = "distance (bp) from 2nd closest GAT to closest GAT",
ylab="Frequency relative to DHS regions",
main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.symbol = list(col=c("red", "blue"), pch=18, lwd=2), strip.background=list(col="grey85")),
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","skyblue"), ...)
panel.xyplot(x, y,
col=c("red","blue"),
pch=18,
cex=0.6,...)
})
)
dev.off()
png(paste0('xy2_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
print(xyplot(rel_freq ~ abs.dis,
data = df.plot,
groups = strand_relationship,
auto.key=list(space="right", points=TRUE),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.3),
#type = c('p', 'smooth'),
xlab = "distance (bp) from 2nd closest GAT to closest GAT",
ylab="Frequency relative to DHS regions",
main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.symbol = list(col=c("red", "blue"), pch=18, lwd=2), strip.background=list(col="grey85")),
panel = function(x,y,...) { panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","skyblue"), ...)
panel.xyplot(x, y,
col=c("red","blue"),
pch=18,
cex=0.6,...)
})
)
dev.off()
}
Now the plotting data frame contains only a single relative-frequency value per distance for each strand relationship.
abs.dis dis strand_relationship actual_freq actual_freq_DHS rel_freq
1 1 1 opposite_strand_GAT 0.0008827896 0.0008383401 4.444953e-05
3 2 2 opposite_strand_GAT 0.0123590546 0.0026634763 9.695578e-03
5 3 3 same_strand_GAT 0.0286505357 0.0162428392 1.240770e-02
6 3 3 opposite_strand_GAT 0.0796516994 0.1027927204 -2.314102e-02
9 4 4 same_strand_GAT 0.0108743630 0.0088636999 2.010663e-03
10 4 4 opposite_strand_GAT 0.0085871353 0.0040607098 4.526426e-03
Figure 2: xy plot of GATA3 peak with motif1 relative to the DHS regions: 3mer structure
Figure 3: xy plot of GATA3 peak with motif2 relative to the DHS regions: 3mer structure
Figure 4: xy plot of GATA3 peak with motif4 relative to the DHS regions: 3mer structure
Figure 5: xy plot of GATA3 peak with motif5 relative to the DHS regions: 3mer structure
Figure 6: xy plot of GATA3 peak with motif6 relative to the DHS regions: 3mer structure
In our previous analysis, we placed emphasis on certain 3-mers by assessing their enrichment in GATA3 peaks relative to DHS regions. This enrichment was determined by calculating the differences in cumulative distribution function (CDF) fractions at a specified “closed” distance (both 16bp and 20bp yield the same 3-mer cluster).
These prioritized 3-mers include “AAA” “TAA” “ATA” “TTA” “AAT” “TAT” “GAT” “ATT” “TTT” “ATC”.
Here, I want to:
1) Extract the patterns from the RSAT analysis that involve combinations of these 3-mers and compare them with the above analysis.
First, use the GATA3 peaks with motif1 for a test run, then perform the analysis on all 5 positive peak sets. Finally, we want to apply it to the peaks without motifs.
Load the package, and the function to process the RSAT results file:
library(Biostrings)
# Read an RSAT dyad-analysis result table and keep the dyads built from two
# given 3-mers.
#
# Args:
#   file_path: tab-separated RSAT result file without a header row. Column 1
#     holds the dyad written as <first>n{<spacing>}<second>; column 8 is used
#     as the ratio.
#   pattern1: 3-mer looked for in the first half of the dyad. Character or
#     DNAString (callers in this file pass both).
#   pattern2: 3-mer looked for in the second half of the dyad. Character or
#     DNAString.
#   number: offset added to the dyad spacing to obtain `relative_distance`.
# Returns:
#   A data.frame with columns first, dyad_distance, second, ratio and
#   relative_distance, ordered by decreasing ratio. It has zero rows when no
#   dyad matches. When pattern1 and pattern2 are the same 3-mer, dyads made
#   of the two reverse complements are included as well.
process_and_subset_RSAT <- function(file_path, pattern1, pattern2, number) {
  input_data <- read.table(file_path, header = FALSE, sep = "\t",
                           stringsAsFactors = FALSE)

  # "gatn{3}atc" splits into c("gat", "3", "atc").
  components_list <- strsplit(as.character(input_data$V1), "n\\{|\\}")
  # vapply() guarantees one character value per row (NA when a part is missing).
  component <- function(i) {
    vapply(components_list, function(parts) parts[i], character(1))
  }
  processed_data <- data.frame(
    first = component(1),
    dyad_distance = as.numeric(component(2)),
    second = component(3),
    ratio = input_data$V8 # ratio
  )

  # Go through DNAString() so character and DNAString arguments are handled
  # alike; the plain strings are what grepl() matches against.
  pattern1 <- DNAString(pattern1)
  pattern2 <- DNAString(pattern2)
  forward1 <- as.character(pattern1)
  forward2 <- as.character(pattern2)

  # Rows whose first/second half contain the given patterns (case-insensitive).
  select_dyads <- function(first_pattern, second_pattern) {
    keep <- grepl(first_pattern, processed_data$first, ignore.case = TRUE) &
      grepl(second_pattern, processed_data$second, ignore.case = TRUE)
    processed_data[keep, ]
  }

  output_data <- select_dyads(forward1, forward2)
  if (identical(forward1, forward2)) {
    # Identical halves: also collect the dyad made of the reverse complements.
    rc_pattern1 <- as.character(reverseComplement(pattern1))
    rc_pattern2 <- as.character(reverseComplement(pattern2))
    output_data <- rbind(output_data, select_dyads(rc_pattern1, rc_pattern2))
  }

  # Order by descending ratio
  output_data <- output_data[order(-output_data$ratio), ]
  # Add the relative distance column
  output_data$relative_distance <- output_data$dyad_distance + number
  output_data
}
In the provided function, pattern1 represents the anchored dyad, pattern2 denotes the other half dyad, and number signifies the user-defined distance (representing the relative separation between two zinc fingers, with “G” serving as the single resolution) when there is no spacing between each dyad.
For AAT and ATT, there are two possible relative “G/C” positions. One is (gat)AAT/ATT(atc); the other is AAT(c)/(g)ATT.
I have examined the Information Content (IC) in all sequence logos of the Information Content Matrix, and examined the letter-probability matrix to determine which configuration is more common/more likely to be observed.
Overall, AAT(c)/(g)ATT has a higher IC than (gat)AAT/ATT(atc).
I will use AAT(c)/(g)ATT to anchor at the ‘G/C’ base for measuring the relative distance between the two zinc fingers.
I’ve created a file that simplifies the computation of the ‘number’ for any given pair of dyads. It operates by adding the corresponding numbers associated with the anchor (pattern1) or query (pattern2) pattern.
# pattern_anchor_at_G_compute_dis.csv: lookup table with one row per 3-mer
# (columns printed below: pattern, identifier, anchor, query). The `anchor`
# and `query` values are the ones summed to obtain `number`.
read.csv('pattern_anchor_at_G_compute_dis.csv')
## pattern identifier anchor query
## 1 GAT GAT 3 0
## 2 ATC ATC 1 2
## 3 ATA (g)ATA 4 -1
## 4 TAT TAT(c) 0 3
## 5 TTA TTA(tc) -1 4
## 6 TAA (ga)TAA 5 -2
## 7 AAT AAT(c) 0 3
## 8 ATT (g)ATT 4 -1
## 9 AAA (gat)AAA 6 -3
## 10 TTT TTT(atc) -2 5
## 11 AGA AGA(t) 2 1
## 12 TCT (a)TCT 2 1
## 13 TAG (ga)TAG 5 -2
## 14 CTA CTA(tc) -1 4
library(lattice)
library(latticeExtra)
library(Biostrings)
# For every peak set and every anchor/query 3-mer pair, pull the matching
# dyads out of the RSAT result (query on the same strand, and its reverse
# complement as the opposite strand) and write an xy plot of the obs/exp
# ratio against the relative distance to a PDF.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC")
Query_dyad <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC")
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')
# Column names given to the one-row NA placeholder used when a subset is empty.
placeholder_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")

  for (pattern1 in Anchor_dyad) {
    print(pattern1)

    for (pattern2 in Query_dyad) {
      print(pattern2)

      # Offset = anchor value of pattern1 + query value of the queried 3-mer
      # (pattern2 itself for the same strand, its reverse complement otherwise).
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      number1 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty subset is replaced by a single all-NA row so that its group
      # label is still present in the plotting data.
      if (nrow(ss) == 0) {
        ss <- data.frame(matrix(NA, ncol = ncol(ss), nrow = 1))
        colnames(ss) <- placeholder_cols
      }
      ss$query_status <- paste0("same_strand_", pattern2)

      if (nrow(os) == 0) {
        os <- data.frame(matrix(NA, ncol = ncol(os), nrow = 1))
        colnames(os) <- placeholder_cols
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      status_levels <- c(
        paste0("same_strand_", pattern2),
        paste0("opposite_strand_", pattern2)
      )
      df.plot$query_status <- factor(df.plot$query_status, levels = status_levels)

      # xy plot
      pdf(
        paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'),
        width = 10,
        height = 6
      )
      print(xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(0, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = function(x, y, ...) {
          panel.abline(v = seq(1, 10, 1), col = "grey90")
          panel.xyplot(x, y,
                       col = c("orange", "darkgreen"),
                       pch = 18,
                       cex = 0.6, ...)
          #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
        }
      ))
      dev.off()
    }
  }
}
test:
library(lattice)
library(latticeExtra)
library(Biostrings)
# Test run: motif_1 peaks only, anchoring at GAT and querying GAT (same
# strand) and its reverse complement (opposite strand). Writes one PDF with
# the obs/exp ratio plotted against the relative distance.
my_motifs <- c("motif_1")
Anchor_dyad <- c("GAT")
Query_dyad <- c("GAT")
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')
# Column names given to the one-row NA placeholder used when a subset is empty.
placeholder_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")

  for (pattern1 in Anchor_dyad) {
    print(pattern1)

    for (pattern2 in Query_dyad) {
      print(pattern2)

      # Offset = anchor value of pattern1 + query value of the queried 3-mer
      # (pattern2 itself for the same strand, its reverse complement otherwise).
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      number1 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty subset is replaced by a single all-NA row so that its group
      # label is still present in the plotting data.
      if (nrow(ss) == 0) {
        ss <- data.frame(matrix(NA, ncol = ncol(ss), nrow = 1))
        colnames(ss) <- placeholder_cols
      }
      ss$query_status <- paste0("same_strand_", pattern2)

      if (nrow(os) == 0) {
        os <- data.frame(matrix(NA, ncol = ncol(os), nrow = 1))
        colnames(os) <- placeholder_cols
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      status_levels <- c(
        paste0("same_strand_", pattern2),
        paste0("opposite_strand_", pattern2)
      )
      df.plot$query_status <- factor(df.plot$query_status, levels = status_levels)

      # xy plot
      pdf(
        paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'),
        width = 10,
        height = 6
      )
      print(xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(0, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = function(x, y, ...) {
          panel.abline(v = seq(1, 10, 1), col = "grey90")
          panel.xyplot(x, y,
                       col = c("orange", "darkgreen"),
                       pch = 18,
                       cex = 0.6, ...)
          #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
        }
      ))
      dev.off()
    }
  }
}
test2:
library(lattice)
library(latticeExtra)
library(Biostrings)
# Second test run: motif_1 peaks only, anchoring at TAA and querying AAA
# (same strand) and its reverse complement (opposite strand). Writes one PDF
# with the obs/exp ratio plotted against the relative distance.
my_motifs <- c("motif_1")
Anchor_dyad <- c("TAA")
Query_dyad <- c("AAA")
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')
# Column names given to the one-row NA placeholder used when a subset is empty.
placeholder_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")

  for (pattern1 in Anchor_dyad) {
    print(pattern1)

    for (pattern2 in Query_dyad) {
      print(pattern2)

      # Offset = anchor value of pattern1 + query value of the queried 3-mer
      # (pattern2 itself for the same strand, its reverse complement otherwise).
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      number1 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty subset is replaced by a single all-NA row so that its group
      # label is still present in the plotting data.
      if (nrow(ss) == 0) {
        ss <- data.frame(matrix(NA, ncol = ncol(ss), nrow = 1))
        colnames(ss) <- placeholder_cols
      }
      ss$query_status <- paste0("same_strand_", pattern2)

      if (nrow(os) == 0) {
        os <- data.frame(matrix(NA, ncol = ncol(os), nrow = 1))
        colnames(os) <- placeholder_cols
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      status_levels <- c(
        paste0("same_strand_", pattern2),
        paste0("opposite_strand_", pattern2)
      )
      df.plot$query_status <- factor(df.plot$query_status, levels = status_levels)

      # xy plot
      pdf(
        paste0('test_xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'),
        width = 10,
        height = 6
      )
      print(xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(0, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = function(x, y, ...) {
          panel.abline(v = seq(1, 10, 1), col = "grey90")
          panel.xyplot(x, y,
                       col = c("orange", "darkgreen"),
                       pch = 18,
                       cex = 0.6, ...)
          #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
        }
      ))
      dev.off()
    }
  }
}
Anchor at each of the top 10 prioritized 3-mers, then query all 10 3-mers on either the same strand or the opposite strand. This generates 10x10=100 enrichment graphs for each of the five peak data sets, resulting in a total of 500 enrichment graphs.
Another method to identify the most enriched structure within each peak dataset is by ranking the files (from the 5 peak sets) based on the observed-to-expected (obs/exp) ratio. This allows us to determine which structural motif is enriched, followed by a search for the corresponding enrichment graph.
Files generated:
GATA3_peak_161win_with_motif_1_RSAT_dyad.txt
GATA3_peak_161win_with_motif_2_RSAT_dyad.txt
GATA3_peak_161win_with_motif_4_RSAT_dyad.txt
GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
GATA3_peak_161win_with_motif_6_RSAT_dyad.txt
GATA3 peak with motif1
# Top 3 dyads for motif 1, ranked by column 8 (ratio), numeric descending.
sort -k8,8nr GATA3_peak_161win_with_motif_1_RSAT_dyad.txt | head -3
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 5168 11.96 10 5178 432.28
## gatn{3}atc gatn{3}atc|gatn{3}atc 0.0000716460535 12482 85.98 36 12518 145.17
## atan{2}atc atan{2}atc|gatn{2}tat 0.0001616794646 10453 196.05 1460 11913 53.32
GATA3 peak with motif2
# Top 3 dyads for motif 2, ranked by column 8 (ratio), numeric descending.
sort -k8,8nr GATA3_peak_161win_with_motif_2_RSAT_dyad.txt | head -3
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 5367 11.05 15 5382 485.84
## gatn{4}atc gatn{4}atc|gatn{4}atc 0.0000504302144 10109 55.31 57 10166 182.76
## atan{3}atc atan{3}atc|gatn{3}tat 0.0001614461361 8061 178.78 812 8873 45.09
GATA3 peak with motif4
# Top 8 dyads for motif 4, ranked by column 8 (ratio), numeric descending.
sort -k8,8nr GATA3_peak_161win_with_motif_4_RSAT_dyad.txt | head -8
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 2635 6.31 0 2635 417.35
## atcn{5}atc atcn{5}atc|gatn{5}gat 0.0001401253129 8136 86.45 3041 11177 94.11
## tagn{0}ata tagn{0}ata|tatn{0}cta 0.0001101653224 4974 72.39 2520 7494 68.72
## gatn{6}ata gatn{6}ata|tatn{6}atc 0.0001922430633 5856 117.34 2756 8612 49.91
## atan{0}gat atan{0}gat|atcn{0}tat 0.0001759397041 5311 115.60 2653 7964 45.94
## atan{4}gat atan{4}gat|atcn{4}tat 0.0001505921639 4145 93.97 2862 7007 44.11
## gatn{2}ata gatn{2}ata|tatn{2}atc 0.0001517807219 4287 97.37 2333 6620 44.03
## atcn{1}atc atcn{1}atc|gatn{1}gat 0.0001617727236 4361 105.00 2663 7024 41.53
GATA3 peak with motif5
# Top 10 dyads for motif 5, ranked by column 8 (ratio), numeric descending.
sort -k8,8nr GATA3_peak_161win_with_motif_5_RSAT_dyad.txt | head -10
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 1460 4.15 0 1460 351.92
## atcn{1}gat atcn{1}gat|atcn{1}gat 0.0000365557250 4232 15.61 14 4246 271.11
## atcn{2}ata atcn{2}ata|tatn{2}gat 0.0001323956841 3042 55.76 429 3471 54.55
## gatn{0}aac gatn{0}aac|gttn{0}atc 0.0000457443231 612 19.75 0 612 30.99
## ctgn{0}ata ctgn{0}ata|tatn{0}cag 0.0001502254396 1839 64.86 5 1844 28.35
## atcn{0}tga atcn{0}tga|tcan{0}gat 0.0004057440254 4520 175.18 50 4570 25.80
## gatn{0}aaa gatn{0}aaa|tttn{0}atc 0.0001042104402 1134 44.99 0 1134 25.20
## atcn{0}aga atcn{0}aga|tctn{0}gat 0.0004149470253 4458 179.15 40 4498 24.88
## atcn{3}taa atcn{3}taa|ttan{3}gat 0.0001898551128 1853 79.23 91 1944 23.39
## aatn{2}gat aatn{2}gat|atcn{2}att 0.0002558550032 2514 107.76 354 2868 23.33
# Show the first 10 motif-5 rows whose dyad (column 1) starts with one of
# GAT/ATC and also contains the other (or the same) 3-mer further along.
rsat_file=GATA3_peak_161win_with_motif_5_RSAT_dyad.txt
awk -F'\t' '$1 ~ /^gat.*gat/' "$rsat_file" | head
awk -F'\t' '$1 ~ /^atc.*atc/' "$rsat_file" | head
echo ""
awk -F'\t' '$1 ~ /^gat.*atc/' "$rsat_file" | head
echo ""
awk -F'\t' '$1 ~ /^atc.*gat/' "$rsat_file" | head
## atcn{2}atc atcn{2}atc|gatn{2}gat 0.0001743278582 1580 73.43 190 1770 21.52
## atcn{9}atc atcn{9}atc|gatn{9}gat 0.0002062495205 684 80.71 39 723 8.47
## atcn{6}atc atcn{6}atc|gatn{6}gat 0.0001786128314 568 72.21 29 597 7.87
## atcn{10}atc atcn{10}atc|gatn{10}gat 0.0002081405868 567 80.51 34 601 7.04
## atcn{4}atc atcn{4}atc|gatn{4}gat 0.0001842588721 523 76.09 37 560 6.87
## atcn{7}atc atcn{7}atc|gatn{7}gat 0.0001966917141 445 78.65 32 477 5.66
## atcn{11}atc atcn{11}atc|gatn{11}gat 0.0001856378756 413 71.03 26 439 5.81
## atcn{12}atc atcn{12}atc|gatn{12}gat 0.0001884293340 393 71.25 26 419 5.52
## atcn{15}atc atcn{15}atc|gatn{15}gat 0.0002001648218 389 73.16 18 407 5.32
## atcn{14}atc atcn{14}atc|gatn{14}gat 0.0001838280257 375 67.98 56 431 5.52
##
## gatn{5}atc gatn{5}atc|gatn{5}atc 0.0000883000715 329 36.10 1 330 9.11
## gatn{3}atc gatn{3}atc|gatn{3}atc 0.0000716460535 239 29.90 6 245 7.99
## gatn{0}atc gatn{0}atc|gatn{0}atc 0.0000515638671 228 22.26 0 228 10.24
## gatn{8}atc gatn{8}atc|gatn{8}atc 0.0000899257860 216 35.58 2 218 6.07
## gatn{11}atc gatn{11}atc|gatn{11}atc 0.0000932630475 214 35.68 1 215 6.00
## gatn{6}atc gatn{6}atc|gatn{6}atc 0.0000971154028 213 39.26 2 215 5.43
## gatn{10}atc gatn{10}atc|gatn{10}atc 0.0000987860472 209 38.21 0 209 5.47
## gatn{2}atc gatn{2}atc|gatn{2}atc 0.0000878513418 208 37.00 0 208 5.62
## gatn{7}atc gatn{7}atc|gatn{7}atc 0.0000847463572 192 33.89 0 192 5.67
## gatn{12}atc gatn{12}atc|gatn{12}atc 0.0000888395037 186 33.59 3 189 5.54
##
## atcn{1}gat atcn{1}gat|atcn{1}gat 0.0000365557250 4232 15.61 14 4246 271.11
## atcn{8}gat atcn{8}gat|atcn{8}gat 0.0000890597270 315 35.23 5 320 8.94
## atcn{10}gat atcn{10}gat|atcn{10}gat 0.0000804379701 250 31.11 3 253 8.03
## atcn{6}gat atcn{6}gat|atcn{6}gat 0.0000802195944 243 32.43 3 246 7.49
## atcn{14}gat atcn{14}gat|atcn{14}gat 0.0000920659368 237 34.05 3 240 6.96
## atcn{13}gat atcn{13}gat|atcn{13}gat 0.0000915682647 232 34.24 2 234 6.77
## atcn{7}gat atcn{7}gat|atcn{7}gat 0.0000795928625 222 31.83 4 226 6.98
## atcn{11}gat atcn{11}gat|atcn{11}gat 0.0000917826817 219 35.12 4 223 6.24
## atcn{18}gat atcn{18}gat|atcn{18}gat 0.0000928758991 201 32.74 1 202 6.14
## atcn{15}gat atcn{15}gat|atcn{15}gat 0.0000847558549 185 30.98 1 186 5.97
GATA3 peak with motif6
# Top 3 dyads for motif 6, ranked by column 8 (ratio), numeric descending.
sort -k8,8nr GATA3_peak_161win_with_motif_6_RSAT_dyad.txt | head -3
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 2445 5.38 2 2447 454.40
## atcn{6}atc atcn{6}atc|gatn{6}gat 0.0001786128314 3871 93.56 216 4087 41.37
## gatn{7}ata gatn{7}ata|tatn{7}atc 0.0001561795197 3011 80.92 119 3130 37.21
In the above 5 GATA3 peak data sets, it seems that AGA-TAA, GAT-ATC, ATA-ATC, ATC-ATC, TAG-ATA, ATC-GAT, ATC-ATA and GAT-ATA are enriched in GATA3 peaks that contain GATA3 motifs.
Unfortunately, AGA is not included in our prioritized list, but it is ranked just after TTT/AAA. Therefore, we can include an analysis of this 3-mer here.
And in our customized analysis, we might also include AGA/TCT to our prioritized 3mer list.
library(lattice)
library(latticeExtra)
library(Biostrings)
# For each of the five peak sets, anchor at AGA and query TAA (same strand)
# and its reverse complement (opposite strand), then write an xy plot of the
# RSAT obs/exp ratio against the relative distance to a PDF.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("AGA")
Query_dyad <- c("TAA")
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')
# Column names given to the one-row NA placeholder used when a subset is empty.
placeholder_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")

  for (pattern1 in Anchor_dyad) {
    print(pattern1)

    for (pattern2 in Query_dyad) {
      print(pattern2)

      # Offset = anchor value of pattern1 + query value of the queried 3-mer
      # (pattern2 itself for the same strand, its reverse complement otherwise).
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      number1 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset +
        compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty subset is replaced by a single all-NA row so that its group
      # label is still present in the plotting data.
      if (nrow(ss) == 0) {
        ss <- data.frame(matrix(NA, ncol = ncol(ss), nrow = 1))
        colnames(ss) <- placeholder_cols
      }
      ss$query_status <- paste0("same_strand_", pattern2)

      if (nrow(os) == 0) {
        os <- data.frame(matrix(NA, ncol = ncol(os), nrow = 1))
        colnames(os) <- placeholder_cols
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      status_levels <- c(
        paste0("same_strand_", pattern2),
        paste0("opposite_strand_", pattern2)
      )
      df.plot$query_status <- factor(df.plot$query_status, levels = status_levels)

      # xy plot
      pdf(
        paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'),
        width = 10,
        height = 6
      )
      print(xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = function(x, y, ...) {
          panel.abline(v = seq(1, 10, 1), col = "grey90")
          panel.xyplot(x, y,
                       col = c("orange", "darkgreen"),
                       pch = 18,
                       cex = 0.6, ...)
          #panel.barchart(x,y, horizontal = FALSE, col=c("red", "blue"), alpha=0.4)
        }
      ))
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_4"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_2"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_6"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
## [1] "motif_5"
## [1] "AGA"
## [1] "TAA"
## 3-letter DNAString object
## seq: TTA
Figure 7: xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure
All enrichment graphs show enrichment of a dyad structure in which AGA lies 6 bp from the opposite-strand TAA, i.e. [AGA][TTA]TC, with a relative distance of 6 bp from the G to the C.
We did not observe enrichment at a relative distance of 0 between AGA and TAA on the same strand. This is because a relative distance of 0 between two zinc fingers implies that only one zinc finger is binding in that location, indicating the absence of a dyad structure. Similarly, when we did not observe enrichment for the AGATAA structure in our graph, it’s because AGATAA is considered a single binding site, rather than two separate motifs.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (GAT) and query (ATC) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("GAT")
Query_dyad <- c("ATC")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "GAT"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
GATA3 peak with motif1
Figure 8: xy plot of rsat analysis for GATA3 peak with motif1: 3mer structure
For GATA3 peaks with motif1, we have seen an enriched dyad structure of a GAT with 8bp relative distance to its same strand ATC.
This is the same as the RSAT-defined dyad GATn{3}ATC, which ranks second highest by ratio, and it also matches the motif1 PWM/seqlogo.
GATA3 peak with motif2
Figure 9: xy plot of rsat analysis for GATA3 peak with motif2: 3mer structure
For GATA3 peaks with motif2, we have seen an enriched dyad structure of a GAT with 9bp relative distance to its same strand ATC.
This is the same as the RSAT-defined dyad GATn{4}ATC, which ranks second highest by ratio, and it also matches the motif2 PWM/seqlogo.
Figure 10: xy plot of rsat analysis for GATA3 peak with motif4: 3mer structure
For GATA3 peaks with motif4, we have seen an enriched dyad structure of a GAT with 8bp relative distance to its opposite strand ATC (same as the same strand GAT).
This is the same as the RSAT-defined dyad atcn{5}atc|gatn{5}gat, which ranks second highest by ratio, and it also matches the motif4 PWM/seqlogo.
Figure 11: xy plot of rsat analysis for GATA3 peak with motif6: 3mer structure
For GATA3 peaks with motif6, we have seen an enriched dyad structure of a GAT with 9bp relative distance to its opposite strand ATC.
This is the same as the RSAT-defined dyad atcn{6}atc|gatn{6}gat, which ranks second highest by ratio, and it also matches the motif6 PWM/seqlogo.
Figure 12: xy plot of rsat analysis for GATA3 peak with motif5: 3mer structure
For GATA3 peaks with motif5, we did not see a strongly enriched dyad structure for GAT relative to +/- ATC. The relative distance of 5bp may or may not be enriched (after all, the ratio seems to be below 25). This is expected, because motif5 has a specific structure of ATCn{1}GAT, which ranked as the 2nd highest ratio in the RSAT results. This structure is different from GAT relative to +/- ATC.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (ATC) and query (GAT) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("GAT")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_4"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_2"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_6"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
## [1] "motif_5"
## [1] "ATC"
## [1] "GAT"
## 3-letter DNAString object
## seq: ATC
GATA3 peak with motif5
Figure 13: xy plot of rsat analysis for GATA3 peak with motif5: 3mer structure
As previously mentioned, we now see an enriched dyad structure of an ATC at 2bp relative distance to its opposite-strand GAT, for peaks with motif5.
This is the same as the RSAT-defined dyad atcn{1}gat, which ranks second highest by ratio, and it also matches the motif5 PWM/seqlogo.
This structure is only enriched in GATA3 peaks with motif1 and motif2 according to RSAT results.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (ATA) and query (ATC) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATA")
Query_dyad <- c("ATC")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "ATA"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
Figure 14: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 15: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 16: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 17: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 18: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
It is clear in the enrichment graph that GATA3 peaks with motif1 has an enriched dyad structure of ATAn{2}ATC (relative distance of two zinc fingers is 8bp); GATA3 peaks with motif2 has an enriched dyad structure of ATAn{3}ATC (relative distance of two zinc finger is 9bp).
For GATA3 peaks with motif4, ATAn{0}GAT and ATAn{4}GAT are enriched. The relative distances between the two zinc fingers are 4bp and 8bp.
No remarkable enrichment of dyad structure related to ATA and ATC for GATA3 peaks with motif5 and motif6.
Notice that this is anchoring at ATC and looking for ATC on same strand or opposite strand.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (ATC) and query (ATC) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("ATC")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_4"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_2"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_6"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
## [1] "motif_5"
## [1] "ATC"
## [1] "ATC"
## 3-letter DNAString object
## seq: GAT
Figure 19: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 20: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 21: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 22: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 23: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
As expected, GATA3 peak with motif 1 and motif 2 do not show enrichment for dyad structure made of this pair of 3mer.
GATA3 peak with motif4 has enrichment at relative distance 8bp for ATC and same strand ATC;
GATA3 peak with motif6 has enrichment at relative distance 9bp for ATC and same strand ATC;
GATA3 peak with motif5 has enrichment at relative distance 2bp for ATC and opposite strand ATC.
These all match with the defined motif structure for each peak set.
Unfortunately, TAG is not included in our prioritized list. It is ranked even after AGA/TCT. We can include an analysis of this 3-mer here.
And in our customized analysis, we might also include TAG/CTA to our prioritized 3mer list.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (TAG) and query (ATA) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("TAG")
Query_dyad <- c("ATA")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "TAG"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
Figure 24: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 25: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 26: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 27: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 28: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
It is interesting to see that, for all peak sets, we did not observe a TAG-TAT dyad structure on the same strand (i.e. TAG paired with an opposite-strand ATA).
For TAG-ATA on the same strand, we only see enrichment for GATA3 peaks with motif4 (RSAT calculates a ratio of 68.72, ranked third, for tagn{0}ata). The relative distance between the two zinc fingers is 4bp, so the binding element has to look like this: GATAGATA.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (ATC) and query (ATA) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("ATC")
Query_dyad <- c("ATA")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "ATC"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
Figure 29: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 30: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 31: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 32: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 33: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
In these graphs, we again did not see an ATC-TAT structure (TAT being the opposite-strand ATA), and we only see one enriched structure for ATC-ATA, at a relative distance of 2bp. This structure must be ATCxGATA, which matches motif5.
library(lattice)
library(latticeExtra)
library(Biostrings)
# Peak sets to scan, plus the anchor (GAT) and query (ATA) 3-mers compared here.
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
Anchor_dyad <- c("GAT")
Query_dyad <- c("ATA")
# Lookup table read from disk; its `pattern`, `anchor` and `query` columns are
# combined below into the number handed to process_and_subset_RSAT().
compute_dis_df <- read.csv('pattern_anchor_at_G_compute_dis.csv')

# Column names given to the all-NA placeholder row when a lookup comes back empty.
result_cols <- c("first", "dyad_distance", "second", "ratio", "relative_distance")

# Panel function: light grey vertical guides at x = 1..10, then the points.
draw_panel <- function(x, y, ...) {
  panel.abline(v = seq(1, 10, 1), col = "grey90")
  panel.xyplot(x, y, col = c("orange", "darkgreen"), pch = 18, cex = 0.6, ...)
}

for (motif in my_motifs) {
  print(motif)
  rsat_results <- paste0("GATA3_peak_161win_with_", motif, "_RSAT_dyad.txt")
  for (pattern1 in Anchor_dyad) {
    print(pattern1)
    for (pattern2 in Query_dyad) {
      print(pattern2)
      anchor_offset <- compute_dis_df[compute_dis_df$pattern == pattern1, "anchor"]
      # Same-strand lookup uses pattern2 itself; the opposite-strand lookup uses
      # its reverse complement.
      number1 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == pattern2, "query"]
      rc_pattern2 <- reverseComplement(DNAString(pattern2))
      print(rc_pattern2)
      number2 <- anchor_offset + compute_dis_df[compute_dis_df$pattern == as.character(rc_pattern2), "query"]

      ss <- process_and_subset_RSAT(rsat_results, pattern1, pattern2, number1)
      os <- process_and_subset_RSAT(rsat_results, pattern1, rc_pattern2, number2)

      # An empty result is replaced by a single all-NA row so that the group
      # still exists in the plotted data; the label is attached either way.
      if (nrow(ss) == 0) {
        ss <- setNames(data.frame(matrix(NA, ncol = ncol(ss), nrow = 1)), result_cols)
      }
      ss$query_status <- paste0("same_strand_", pattern2)
      if (nrow(os) == 0) {
        os <- setNames(data.frame(matrix(NA, ncol = ncol(os), nrow = 1)), result_cols)
      }
      os$query_status <- paste0("opposite_strand_", pattern2)

      df.plot <- rbind(ss, os)
      strand_levels <- paste0(c("same_strand_", "opposite_strand_"), pattern2)
      df.plot$query_status <- factor(df.plot$query_status, levels = strand_levels)

      # xy plot, written to one PNG per motif / 3-mer pair
      png(paste0('xy_RSAT_dyad_closest_2nd_', pattern2, '_to_closest_1st_', pattern1, '_to_GATA3_pos_', motif, '_compare_to_DHS.png'))
      dyad_plot <- xyplot(
        ratio ~ relative_distance,
        data = df.plot,
        groups = query_status,
        auto.key = list(space = "right", points = TRUE),
        aspect = 1,
        xlim = c(-1, 30),
        ylim = c(0, 500),
        xlab = paste0("distance (bp) from 2nd closest ", pattern2, " to closest ", pattern1),
        ylab = "RSAT obs/exp Ratio",
        main = paste0("GATA3 peak with ", motif),
        between = list(y = 1.0),
        scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
        par.settings = list(
          superpose.symbol = list(col = c("orange", "darkgreen"), pch = 18, lwd = 2),
          strip.background = list(col = "grey85")
        ),
        panel = draw_panel
      )
      print(dyad_plot)
      dev.off()
    }
  }
}
## [1] "motif_1"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_4"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_2"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_6"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
## [1] "motif_5"
## [1] "GAT"
## [1] "ATA"
## 3-letter DNAString object
## seq: TAT
Figure 34: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 35: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 36: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 37: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Figure 38: xy plot of rsat analysis for GATA3 peak with motif12456: 3mer structure
Notice that we again did not see dyad structures of GAT-TAT (TAT being the opposite-strand ATA), but we did identify enriched GAT-ATA structures for GATA3 peaks with motif4 and motif6.
For GATA3 peaks with motif4, we see gatn{6}ata and gatn{2}ata enriched. The relative distances between the two zinc fingers are 8bp and 4bp. The 8bp one matches motif4; the 4bp one matches GATAGATA.
For GATA3 peaks with motif6, we see gatn{7}ata enriched; the relative distance between the two zinc fingers is 9bp. This matches the motif6 PWM.
For the added 2 pairs of 3mer:
I have added their relative distances to the pattern_anchor_at_G_compute_dis.csv file.
pattern identifier anchor query
AGA AGA(t) 2 1
TCT (a)TCT 2 1
TAG (ga)TAG 5 -2
CTA CTA(tc) -1 4
Overall, RSAT-dyad analysis proves effective in identifying enriched binding elements within a set of peak regions, with proper controls and parameter settings.
Enriched dyad structures can be discerned through various combinations of 3-mers. Establishing the relative distance between zinc fingers serves as a valuable method to determine if a structure arises from the same binding element.
Goal: given a prioritized 3mer list, we can generate the closest 3mer coordinates to a given sets of peak summits.
240208_closestBed.R:
(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer)
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# closestBed function
# Run bedtools closestBed on two BED-like data frames and return its output
# as a data frame.
#
# Arguments:
#   functionstring  shell command / path of the closestBed executable.
#   bed1, bed2      data frames written out tab-separated, without header or
#                   row names, as the -a and -b inputs; both are sorted with
#                   `sort -k1,1 -k2,2n` before closestBed is called.
#   opt.string      extra options inserted verbatim before "-a".
#
# Returns the closestBed output read back with read.table(), with columns
# named c(colnames(bed1), colnames(bed2), 'dis').
# NOTE(review): that naming presumes opt.string makes closestBed append exactly
# one trailing column (e.g. "-d") -- confirm against callers.
#
# Side effects: sets options(scipen = 99) and leaves it set (unchanged from the
# previous behaviour, in case callers rely on it for their own write.table
# calls). Intermediate files get unique names in the current working directory
# and are removed when the function exits, whether it succeeds or fails.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  options(scipen = 99) # not use scientific notation when writing out

  # Unique names in the working directory (the same location the fixed names
  # used to live in), so concurrent runs in one directory cannot clobber each
  # other's intermediate files.
  work_dir <- getwd()
  a.file <- tempfile("a.file.", tmpdir = work_dir, fileext = ".bed")
  b.file <- tempfile("b.file.", tmpdir = work_dir, fileext = ".bed")
  a.sorted <- tempfile("a.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  b.sorted <- tempfile("b.file.sorted.", tmpdir = work_dir, fileext = ".bed")
  out.file <- tempfile("out.file.", tmpdir = work_dir, fileext = ".bed")
  # remove intermediate files, also when a step below fails
  on.exit(unlink(c(a.file, b.file, a.sorted, b.sorted, out.file)), add = TRUE)

  # Echo a shell command, run it, and stop on a non-zero exit status instead of
  # letting the failure surface later as an unrelated read.table() error.
  run_or_stop <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the intermediate files
  write.table(bed1, file = a.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b.file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort a and b file by coordinates
  run_or_stop(paste('sort -k1,1 -k2,2n', shQuote(a.file), '>', shQuote(a.sorted)))
  run_or_stop(paste('sort -k1,1 -k2,2n', shQuote(b.file), '>', shQuote(b.sorted)))

  # call closestBed on the sorted bed1 and bed2; functionstring and opt.string
  # are passed through unquoted so they may carry several shell tokens
  run_or_stop(paste(functionstring, opt.string, "-a", shQuote(a.sorted), "-b", shQuote(b.sorted), ">", shQuote(out.file), sep = " "))

  res <- read.table(out.file, header = FALSE, comment.char = '')
  colnames(res) <- c(colnames(bed1), colnames(bed2), 'dis')
  res
}
# Input locations: genome-wide 3mer coordinates (dir1), GATA3 peak sets (dir2)
# and the independent DHS control regions (dir3).
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/MAST_positive_control/"
dir3 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"
my_motifs <- c("motif_1", "motif_4", "motif_2", "motif_6", "motif_5")
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
library(bigWig)

# Read the genome-wide coordinate file of one triplet on one strand ("plus" or
# "minus"). Fails with a clear message unless exactly one file matches the glob.
read_triplet_bed <- function(strand, triplet) {
  hits <- Sys.glob(paste0(dir1, "hg38.3.3.3", strand, ".*_", triplet, ".bed"))
  if (length(hits) != 1L) {
    stop("expected exactly one ", strand, " strand file for ", triplet,
         " in ", dir1, ", found ", length(hits), call. = FALSE)
  }
  read.table(file = hits, sep = "\t", header = FALSE)
}

# Write a data frame as a headerless, tab-separated BED file.
write_bed <- function(x, path) {
  write.table(x, file = path, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
}

# The region sets do not depend on the triplet, so each file is read once
# instead of once per loop iteration.
# consensus neg
indep.DHS.control.consensus <- center.bed(
  read.table(paste0(dir3, "MCF7DHS_consensus_noGATA_without_motifs_123456_78.bed"), header = FALSE),
  upstreamWindow = 0, downstreamWindow = 0
)
# peak summits, one entry per motif
peak_summits_by_motif <- lapply(setNames(my_motifs, my_motifs), function(motif) {
  center.bed(
    read.table(paste0(dir2, "GATA3_peak_161win_with_", motif, ".bed"), header = FALSE),
    upstreamWindow = 0, downstreamWindow = 0
  )
})

for (triplet in prioritized_triplets) {
  print(triplet)
  # 3mer genome coordinates
  triplet_beds <- list(
    plus = read_triplet_bed("plus", triplet),
    minus = read_triplet_bed("minus", triplet)
  )

  # closestBed--1st closest 3mer to the DHS control regions. The result does
  # not depend on the motif, so it is computed once per triplet and strand
  # (previously it was recomputed and rewritten identically for every motif).
  for (strand in names(triplet_beds)) {
    write_bed(
      bedTools.closest(bed1 = indep.DHS.control.consensus[, 1:3], bed2 = triplet_beds[[strand]], opt.string = "-d -t first"),
      paste0("closest.1st.", strand, ".", triplet, ".to.indep.DHS.control.consensus.bed")
    )
  }

  for (motif in my_motifs) {
    print(motif)
    # closestBed--1st closest 3mer to the GATA3 peak summits
    for (strand in names(triplet_beds)) {
      write_bed(
        bedTools.closest(bed1 = peak_summits_by_motif[[motif]][, 1:3], bed2 = triplet_beds[[strand]], opt.string = "-d -t first"),
        paste0("closest.1st.", strand, ".", triplet, ".to.GATA3.with.", motif, ".bed")
      )
    }
  }
}
runR.sh
#!/bin/bash
# SLURM batch script: runs 240208_closestBed.R under R 4.1.2.
# Requests one node, one task, 8 CPUs per task and 200G of memory on the
# "general" partition; stdout/stderr go to runR.sh_<jobid>.out/.err.
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
# The R script writes its outputs relative to the directory the job runs in.
module load R/4.1.2
Rscript 240208_closestBed.R
Overall Goal: given the closest 3mer coordinates file, we can loop through a set of 3mer list and generate the 2nd closest 3mer coordinates to the provided closest 3mer coordinates.
sort_3mer_cor.sh
#!/bin/bash
# SLURM job template: coordinate-sorts one genome-wide 3mer BED file.
# The value assigned to "name" below is a placeholder; a per-file copy of this
# script is produced with sed, which substitutes the real file suffix for it.
#SBATCH --job-name=sort_3mer_cor.sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 4
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=32G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o sort_3mer_cor.sh_%j.out
#SBATCH -e sort_3mer_cor.sh_%j.err
# Record which node ran the job.
hostname
name=XXXXXX
input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
# Sort by chromosome (column 1), then numerically by start (column 2); the
# sorted copy is written to the job's working directory.
sort -k1,1 -k2,2n ${input_dir1}hg38.3.3.3${name}.bed > hg38.3.3.3${name}.sorted.bed
# Create and submit one sort job per 3mer coordinate file in the current
# directory, by filling the placeholder in the sort_3mer_cor.sh template.
file=sort_3mer_cor.sh
for i in hg38.3.3.3*.bed
do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$i" ] || continue
  # Outputs of an earlier run (*.sorted.bed) match the glob too; sorting them
  # again would only produce *.sorted.sorted.bed files.
  case "$i" in
    *.sorted.bed) continue ;;
  esac
  # Keep the part between the "hg38.3.3.3" prefix and the ".bed" suffix.
  nm=${i##*/}
  nm=${nm#hg38.3.3.3}
  nm=${nm%.bed}
  echo "$nm"
  sed -e "s/XXXXXX/${nm}/g" "$file" > "sort_hg38.3.3.3${nm}.sh"
  sbatch "sort_hg38.3.3.3${nm}.sh"
  # Pause between submissions.
  sleep 1
done
Does bedtools subtract change the internal order of the file?
#ls /labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/*.sorted.bed
# Check on a small example whether bedtools subtract preserves the row order of
# its -a input.
# Subset a test file: plus-strand GAT coordinates on chr4 only.
awk '$1 == "chr4"' /labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/hg38.3.3.3plus.36_GAT.sorted.bed > hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed
wc -l hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed #2583363
wc -l /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest.1st.plus.GAT.to.GATA3.with.motif_1.bed #12470
# Keep columns 4-10 of the closestBed output, re-join them with tabs,
# de-duplicate and coordinate-sort.
# NOTE(review): uniq only collapses adjacent identical lines and runs before
# sort here -- confirm duplicates are always adjacent in this file.
awk '{print $4, $5, $6, $7, $8, $9, $10}' /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest.1st.plus.GAT.to.GATA3.with.motif_1.bed | awk '{$1=$1}1' OFS="\t" | uniq | sort -k1,1 -k2,2n > closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed
wc -l closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed #12461 #fewer than the original file because some peaks share the same closest 3mer coordinates
module load bedtools
# Remove the closest 3mers from the chr4 subset (-f 1.00: 100% overlap
# required, -s: same strand only).
bedtools subtract -a hg38.3.3.3plus.36_GAT_subset_chr4.sorted.bed -b closest.1st.plus.GAT.to.GATA3.with.motif_1.uniq.sorted.bed -f 1.00 -s > substract_output.bed #2582612
# Re-sort the result and compare it with the unsorted output.
sort -k1,1 -k2,2n substract_output.bed > substract_output.sorted.bed
diff substract_output.bed substract_output.sorted.bed
# diff prints nothing: the two files are identical.
bedtools subtract does not change the order of the input files.
Remove the first closest GAT with bedtools subtract
Subtract the 1st closest GAT from all.GAT, then find the closest 2nd GAT to the closest 1st GAT.
-f Requiring a minimal overlap fraction before subtracting. Here we define -f 1.00 to make sure of a 100% overlap between two file before subtracting.
-s Enforcing same “strandedness” while scanning for features in -b file that should be subtracted from -a file.
#!/bin/bash
# SLURM batch script: for every prioritized 3mer, remove the 1st-closest 3mer
# occurrences (to each GATA3 peak set and to the DHS control) from the
# genome-wide sorted 3mer coordinates, separately for the plus and minus files.
#SBATCH --job-name=remove_1st_3mer.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err
input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/
#output_dir=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/
my_motifs=("motif_1" "motif_2" "motif_4" "motif_5" "motif_6")
prioritized_triplets=("AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT" "TAG" "CTA")
module load bedtools

# subtract_first_closest CLOSEST_BED GENOME_3MER_BED OUT_BED
# Takes columns 4-10 of CLOSEST_BED (a closestBed output file), removes
# duplicate rows, and subtracts them from GENOME_3MER_BED. A feature is removed
# only when it is fully covered (-f 1.00) on the same strand (-s).
# The result is written to OUT_BED in the current directory.
subtract_first_closest() {
  local closest_bed=$1 genome_bed=$2 out_bed=$3
  local uniq_bed
  uniq_bed="$(basename "${closest_bed%.bed}").uniq.sorted.bed"

  # Without this guard a missing input would give an empty -b file and the
  # mistake would go unnoticed downstream.
  if [ ! -s "$closest_bed" ]; then
    echo "skipping: missing or empty input $closest_bed" >&2
    return 1
  fi

  # Sort first so that identical rows are adjacent, then de-duplicate:
  # uniq only collapses neighbouring lines.
  awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "$closest_bed" \
    | sort -k1,1 -k2,2n | uniq > "$uniq_bed"
  bedtools subtract -a "$genome_bed" -b "$uniq_bed" -f 1.00 -s > "$out_bed"
  rm -f "$uniq_bed"
}

# GATA3 peak sets: one output per triplet, motif and strand.
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"
  for motif in "${my_motifs[@]}"
  do
    echo "$motif"
    for strand in plus minus
    do
      subtract_first_closest \
        "${input_dir2}closest.1st.${strand}.${triplet}.to.GATA3.with.${motif}.bed" \
        "${input_dir1}hg38.3.3.3${strand}.${triplet}.sorted.bed" \
        "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_with_${motif}.bed"
    done
  done
done

# independent DHS control
# NOTE(review): the "36_" in these output names is kept because the DHS R
# script reads files with exactly this name for every triplet.
for triplet in "${prioritized_triplets[@]}"
do
  echo "$triplet"
  for strand in plus minus
  do
    subtract_first_closest \
      "${input_dir2}closest.1st.${strand}.${triplet}.to.indep.DHS.control.consensus.bed" \
      "${input_dir1}hg38.3.3.3${strand}.${triplet}.sorted.bed" \
      "hg38.3.3.3${strand}.36_${triplet}_without_1st_${strand}_${triplet}_to_indep_DHS_control.bed"
  done
done
Identify redundant spaced 3mers, avoid showing duplicated results.
In our prioritized list, we have 14 unique 3mers, which can be described as 7 pairs consisting of a 3mer and its reverse complement.
Pairing them individually would yield 196 combinations. However, this approach conveys redundant information already captured by their same strand’s reverse complement. For instance, the plus strand ATC-TTT is equivalent to plus strand AAA-GAT, and AAA-AAA is equivalent to TTT-TTT on the same strand.
It is important to list all unique combinations of spaced 3-mers so that we do not show redundant information in the downstream analysis.
Additionally, in our calculation of relative distances, it’s essential to account for the upstream and downstream orientation of the 3mers. For example, (gat)AAA-TTA(tc) and TTA(tc)-(gat)AAA represent distinct configurations.
# Reverse-complement a plus-strand sequence: the returned string is the
# opposite strand read 5'->3', i.e. the same site expressed on the plus strand.
# Only upper-case A/T/G/C are mapped.
convert_plus_strand_6mer <- function(sequence) {
  # Watson-Crick partner of each base
  pairing <- c(A = "T", T = "A", G = "C", C = "G")
  # Individual letters of the (first) input string
  bases <- strsplit(sequence, "")[[1]]
  # Read the letters back to front and swap each for its partner in a single
  # vectorised lookup, then glue them back into one string
  paste(pairing[rev(bases)], collapse = "")
}
# Prioritized 3mers (seven 3mers plus their reverse complements)
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
# 6mers kept so far; a 6mer is dropped when its reverse complement was kept
# earlier in the enumeration
non_redundant_6mers <- list()
# Enumerate every ordered (upstream, downstream) pair of triplets
for (upstream_triplet in prioritized_triplets) {
  for (downstream_triplet in prioritized_triplets) {
    sixmer <- paste0(upstream_triplet, downstream_triplet)
    mirror_sixmer <- convert_plus_strand_6mer(sixmer)
    # Same site already represented through the other strand: skip it
    if (mirror_sixmer %in% non_redundant_6mers) {
      next
    }
    non_redundant_6mers[[length(non_redundant_6mers) + 1L]] <- sixmer
  }
}
# Split every kept 6mer back into its two triplets
first_3_bases <- substr(non_redundant_6mers, 1, 3)
last_3_bases <- substr(non_redundant_6mers, 4, 6)
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)
nrow(df)
## [1] 105
The above R code aims to generate non-redundant 6-mers from prioritized triplets and then creating a dataframe to represent them.
First, we define a convert_plus_strand_6mer function to reverse the input DNA sequence, finds the complementary bases for each base, and then combines them into a string representing the reverse complement.
Then we perform a nested loop, which iterates through each pair of prioritized triplets.
For each pair, it concatenates them to form a 6-mer, and then finds the reverse complement of the 6-mer using the convert_plus_strand_6mer function. If the reverse complement is already in non_redundant_6mers, we do not include the 6mer to the final output.
Through running this code, we identified 105 combinations of 3mer pairs. These pairs are unique while considering them to be on the same strand.
However, when considering the identification of 3mer combinations on both the plus and minus strands, it’s crucial to account for these combinations. If we limit our focus solely to 3mer-3mer pairs on the same strand, specifically the plus strand, each of the 196 combinations will be distinct from one another.
Notice that the relative distance here need to be carefully defined according to different 3mer.
I am using the bigWig package to anchor at the specific G/C base for each 3mer. The fiveprime.bed() defines the anchor point based on the strand information:
If strand = '+' while using fiveprime.bed
anchor point = original start
start = anchor point - upstreamwindow
end = anchor point + 1 + downstreamwindow
If strand = '-' while using fiveprime.bed
anchor point = original end
start = anchor point - downstreamwindow
end = anchor point + 1 + upstreamwindow
I have defined the upstream/downstream window value for each 3mer in the prioritized set, in this .csv file:
#pattern_anchor_at_GorC_for_bigWig_pkg.csv
read.csv('pattern_anchor_at_GorC_for_bigWig_pkg.csv')
## pattern identifier plus_upstream plus_downstream minus_upstream
## 1 GAT GAT 0 0 -1
## 2 ATC ATC -2 2 -3
## 3 ATA (g)ATA 1 -1 0
## 4 TAT TAT(c) -3 3 -4
## 5 TTA TTA(tc) -4 4 -5
## 6 TAA (ga)TAA 2 -2 1
## 7 AAT AAT(c) -3 3 -4
## 8 ATT (g)ATT 1 -1 0
## 9 AAA (gat)AAA 3 -3 2
## 10 TTT TTT(atc) -5 5 -6
## 11 AGA AGA(t) -1 1 -2
## 12 TCT (a)TCT -1 1 -2
## 13 TAG (ga)TAG 2 -2 1
## 14 CTA CTA(tc) -4 4 -5
## minus_downstream
## 1 1
## 2 3
## 3 0
## 4 4
## 5 5
## 6 -1
## 7 4
## 8 0
## 9 -2
## 10 6
## 11 2
## 12 2
## 13 -1
## 14 5
Automate analysis on all prioritized 3mers: find the closest 2nd 3mer.
Anchor at plus strand closest 3mer, find the 2nd closest plus strand 3mer relative to the closest 3mer.
240225_closestBed.R
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Variant of the closestBed wrapper that sorts only the -a input; bed2 is
  # written out and passed as -b in the order given, so the caller is
  # responsible for it being coordinate-sorted.
  #
  # Args:
  #   functionstring: command used to invoke closestBed (pasted into the shell
  #                   command line as-is).
  #   bed1:           data frame written out, coordinate-sorted and passed as -a.
  #   bed2:           data frame written out unchanged and passed as -b.
  #   opt.string:     extra command-line options placed before -a/-b.
  #
  # Returns:
  #   The closestBed output as a data frame whose columns are named
  #   c(colnames(bed1), colnames(bed2), 'dis'); opt.string therefore has to add
  #   exactly one trailing column (e.g. '-d').
  #
  # Side effects:
  #   options(scipen = 99) is set and intentionally left set, as before.
  #   Stops with an error when any shell command exits with a non-zero status.
  options(scipen = 99) # not use scientific notation when writing out

  # Uniquely named scratch files in the working directory: concurrent runs in
  # the same directory no longer overwrite each other's intermediates.
  scratch <- function(prefix) tempfile(pattern = prefix, tmpdir = getwd(), fileext = ".bed")
  a_raw <- scratch("a.file.")
  a_sorted <- scratch("a.file.sorted.")
  b_sorted <- scratch("b.file.sorted.")
  out_file <- scratch("out.file.")
  # remove intermediate files, also when a step fails
  on.exit(unlink(c(a_raw, a_sorted, b_sorted, out_file)), add = TRUE)

  # system() reports failure through its return value, not through an R error,
  # so the exit status has to be checked explicitly.
  run_checked <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to scratch files
  write.table(bed1, file = a_raw, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the a file by chromosome, then numerically by start
  run_checked(paste("sort -k1,1 -k2,2n", shQuote(a_raw), ">", shQuote(a_sorted)))

  # call closestBed on bed1 and bed2
  run_checked(paste(functionstring, opt.string,
                    "-a", shQuote(a_sorted), "-b", shQuote(b_sorted),
                    ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}
library(bigWig)
# List of prioritized triplets. Every ordered (anchor, query) pair of these is
# processed below: 14 x 14 = 196 combinations.
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

dir1 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/"
# Placeholder value: the submission loop substitutes the motif name with sed.
motif <- "XXXXXX"

# anchor position: closest plus-strand 3mer to the peak summits, anchored at
# the first letter base. Each anchor file is read once and reused for all
# query 3mers.
anchor_beds <- lapply(setNames(prioritized_triplets, prioritized_triplets), function(pattern1) {
  fiveprime.bed(
    read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.GATA3.with.", motif, ".bed"), header = FALSE)[, 4:11],
    upstreamWindow = 0, downstreamWindow = 0
  )
})

# The query 3mer is the outer loop so that each genome-wide query file is read
# once instead of once per anchor 3mer. The set of output files is unchanged.
for (pattern2 in prioritized_triplets) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer
  # coordinates) - anchor at the first letter base
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(file = paste0(dir2, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_with_", motif, ".bed"), sep = "\t", header = FALSE),
    upstreamWindow = 0, downstreamWindow = 0
  )
  for (pattern1 in prioritized_triplets) {
    print(pattern1)
    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.with.motif <- bedTools.closest.mod(
      bed1 = anchor_beds[[pattern1]][, 1:3], bed2 = plus.3mer.file, opt.string = "-d -t first"
    )
    write.table(closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.with.motif,
                file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3.with.", motif, ".bed"),
                quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  }
}
runR.sh
#!/bin/bash
# SLURM job template: runs the per-motif copy of 240225_closestBed.R inside its
# own output directory. The six-X placeholder in this file is replaced with a
# motif name by sed before the script is submitted.
#SBATCH --job-name=runR_XXXXXX.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=64G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_XXXXXX.sh_%j.out
#SBATCH -e runR_XXXXXX.sh_%j.err
# Record which node ran the job.
hostname
name=XXXXXX
# Each motif gets its own working directory; the R script writes its scratch
# and result files relative to the current directory.
mkdir GATA3_peak_with_${name}
cd GATA3_peak_with_${name}
module load R/4.1.2
# The per-motif R script lives one level up, next to this job script.
Rscript ../240225_closestBed_${name}.R
parallel running:
# Templates whose placeholder gets replaced by each motif name
r_file=240225_closestBed.R
sh_file=runR.sh
my_motifs=("motif_1" "motif_2" "motif_4" "motif_5" "motif_6")

# Pass 1: write one R script per motif
for idx in "${!my_motifs[@]}"
do
  nm="${my_motifs[$idx]}"
  printf '%s\n' "$nm"
  sed "s/XXXXXX/${nm}/g" "$r_file" > "240225_closestBed_${nm}.R"
done

# Pass 2: write one job script per motif and submit it, one second apart
for idx in "${!my_motifs[@]}"
do
  nm="${my_motifs[$idx]}"
  printf '%s\n' "$nm"
  sed "s/XXXXXX/${nm}/g" "$sh_file" > "runR_${nm}.sh"
  sbatch "runR_${nm}.sh"
  sleep 1
done
240225_closestBed_DHS.R
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Variant of the closestBed wrapper that sorts only the -a input; bed2 is
  # written out and passed as -b in the order given, so the caller is
  # responsible for it being coordinate-sorted.
  #
  # Args:
  #   functionstring: command used to invoke closestBed (pasted into the shell
  #                   command line as-is).
  #   bed1:           data frame written out, coordinate-sorted and passed as -a.
  #   bed2:           data frame written out unchanged and passed as -b.
  #   opt.string:     extra command-line options placed before -a/-b.
  #
  # Returns:
  #   The closestBed output as a data frame whose columns are named
  #   c(colnames(bed1), colnames(bed2), 'dis'); opt.string therefore has to add
  #   exactly one trailing column (e.g. '-d').
  #
  # Side effects:
  #   options(scipen = 99) is set and intentionally left set, as before.
  #   Stops with an error when any shell command exits with a non-zero status.
  options(scipen = 99) # not use scientific notation when writing out

  # Uniquely named scratch files in the working directory: concurrent runs in
  # the same directory no longer overwrite each other's intermediates.
  scratch <- function(prefix) tempfile(pattern = prefix, tmpdir = getwd(), fileext = ".bed")
  a_raw <- scratch("a.file.")
  a_sorted <- scratch("a.file.sorted.")
  b_sorted <- scratch("b.file.sorted.")
  out_file <- scratch("out.file.")
  # remove intermediate files, also when a step fails
  on.exit(unlink(c(a_raw, a_sorted, b_sorted, out_file)), add = TRUE)

  # system() reports failure through its return value, not through an R error,
  # so the exit status has to be checked explicitly.
  run_checked <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (status != 0) {
      stop("command failed with exit status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to scratch files
  write.table(bed1, file = a_raw, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort the a file by chromosome, then numerically by start
  run_checked(paste("sort -k1,1 -k2,2n", shQuote(a_raw), ">", shQuote(a_sorted)))

  # call closestBed on bed1 and bed2
  run_checked(paste(functionstring, opt.string,
                    "-a", shQuote(a_sorted), "-b", shQuote(b_sorted),
                    ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}
library(bigWig)
# List of prioritized triplets. Every ordered (anchor, query) pair of these is
# processed below: 14 x 14 = 196 combinations.
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")

dir1 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/"

# DHS regions
# anchor position: closest plus-strand 3mer to the DHS control regions. Each
# anchor file is read once and reused for all query 3mers.
anchor_beds <- lapply(setNames(prioritized_triplets, prioritized_triplets), function(pattern1) {
  fiveprime.bed(
    read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.indep.DHS.control.consensus.bed"), header = FALSE)[, 4:11],
    upstreamWindow = 0, downstreamWindow = 0
  )
})

# The query 3mer is the outer loop so that each genome-wide query file is read
# once instead of once per anchor 3mer. The set of output files is unchanged.
for (pattern2 in prioritized_triplets) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer
  # coordinates)
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(file = paste0(dir2, "hg38.3.3.3plus.36_", pattern2, "_without_1st_plus_", pattern2, "_to_indep_DHS_control.bed"), sep = "\t", header = FALSE),
    upstreamWindow = 0, downstreamWindow = 0
  )
  for (pattern1 in prioritized_triplets) {
    print(pattern1)
    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.DHS <- bedTools.closest.mod(
      bed1 = anchor_beds[[pattern1]][, 1:3], bed2 = plus.3mer.file, opt.string = "-d -t first"
    )
    write.table(closest.2nd.plus.3mer.to.1st.plus.3mer.DHS,
                file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".indep.DHS.control.bed"),
                quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  }
}
runR_DHS.sh
#!/bin/bash
# SLURM batch script: runs 240225_closestBed_DHS.R (the DHS control analysis)
# inside a dedicated DHS/ directory. Requests one node, one task, 8 CPUs per
# task and 200G of memory on the "general" partition.
#SBATCH --job-name=runR_DHS.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_DHS.sh_%j.out
#SBATCH -e runR_DHS.sh_%j.err
# Record which node ran the job.
hostname
# The R script writes its scratch and result files relative to the current
# directory, so it is run from inside DHS/.
mkdir DHS
cd DHS
module load R/4.1.2
# The R script lives one level up, next to this job script.
Rscript ../240225_closestBed_DHS.R
coherence check:
check some coordinates on UCSC genome browser, making sure that we are anchoring at the correct G/C.
For some 3mers, we empirically determine a relative G/C position based on the GATA3 PWMs, so it is normal to see that some of the anchored positions are not exactly a G or a C.
Make minimal plots to communicate results.
For each peak set, (5 positive GATA3 peak sets and one DHS independent regions), there are 196 unique 3mer pairs with different relative distances.
Since these 3mer pairs are prioritized based on their potential enrichment in GATA3 peak sets, they might convey similar enrichment information. Thus we could plot bw plots at different distances, showing which distance they all prefer.
A density plot will have 105 traces for each peak set.
Or anchor at 1st closest GAT, for instance, then plot traces of 2nd closest 3mer relative to this GAT
ls *to.1st.plus.GAT.GATA3.with.motif_1.bed
wc -l *to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 12470 closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## 174580 total
Prepare data:
# Relative frequency of every distinct value in `data`.
# Returns a data frame with one row per distinct value: `value` holds the
# value as text (the name from the frequency table) and `actual_freq` its
# count divided by length(data).
calculate_actual_frequency <- function(data) {
  # Tally how often each distinct value occurs
  counts <- table(data)
  # Express each tally as a share of all observations
  shares <- counts / length(data)
  data.frame(value = names(shares), actual_freq = as.vector(shares))
}
# Collect, for every motif / anchor 3mer / query 3mer combination, the
# distances stored in the matching "closest.2nd.plus..." file and attach to
# each row how frequently its absolute distance occurs within that file.
#my_motifs = c("motif_1", "motif_4","motif_2", "motif_6", "motif_5")
my_motifs = c("motif_1")
Anchor_triplets = c("GAT")
Query_triplets = c("AAA", "TAA" ,"ATA" ,"TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
for (motif in my_motifs){
print(motif)
# NOTE(review): df.all is re-created for every motif, so after the loop it only
# holds the rows of the last motif -- confirm this is intended before running
# with more than one motif.
df.all=data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets){
print(Anchor_triplet)
df.peak.dis.all=data.frame(matrix(nrow = 0, ncol = 6))
for (Query_triplet in Query_triplets){
print(Query_triplet)
df.peak.dis = data.frame(matrix(nrow = 0, ncol = 6))
colnames(df.peak.dis) = c("dis","anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq")
# Sys.glob() returns the path only if the file exists, so a missing file is
# skipped silently instead of raising an error.
for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd.plus.", Query_triplet, ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed")))) {
print(closest_2nd_dis)
# Column 11 is used as the distance (presumably the closestBed distance
# column -- confirm against the file layout). cbind() with the labels turns
# everything into text, hence the as.integer() conversion below.
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], Anchor_triplet, Query_triplet, motif))
colnames(temp) = c("dis", "anchor_3mer", "query_3mer", "status")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
# Frequency of each absolute distance within this one file
actual_frequencies = calculate_actual_frequency(temp$abs.dis)
# Attach the frequency to every row; merge() puts the key column (abs.dis)
# first in the result.
temp1= merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
df.peak.dis = rbind(df.peak.dis,temp1)
}
df.peak.dis.all=rbind(df.peak.dis.all, df.peak.dis)
}
df.all=rbind(df.all, df.peak.dis.all)
}
}
## [1] "motif_1"
## [1] "GAT"
## [1] "AAA"
## [1] "./closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAA"
## [1] "./closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATA"
## [1] "./closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TTA"
## [1] "./closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "AAT"
## [1] "./closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAT"
## [1] "./closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATT"
## [1] "./closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TTT"
## [1] "./closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "AGA"
## [1] "./closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TCT"
## [1] "./closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "TAG"
## [1] "./closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed"
## [1] "CTA"
## [1] "./closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed"
str(df.all)
## 'data.frame': 174580 obs. of 6 variables:
## $ abs.dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ anchor_3mer: chr "GAT" "GAT" "GAT" "GAT" ...
## $ query_3mer : chr "AAA" "AAA" "AAA" "AAA" ...
## $ status : chr "motif_1" "motif_1" "motif_1" "motif_1" ...
## $ actual_freq: num 0.0573 0.0573 0.0573 0.0573 0.0573 ...
unique(df.all$anchor_3mer)
## [1] "GAT"
unique(df.all$query_3mer)
## [1] "AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT"
## [13] "TAG" "CTA"
nrow(df.all)
## [1] 174580
head(df.all)
## abs.dis dis anchor_3mer query_3mer status actual_freq
## 1 3 3 GAT AAA motif_1 0.05725742
## 2 3 3 GAT AAA motif_1 0.05725742
## 3 3 3 GAT AAA motif_1 0.05725742
## 4 3 3 GAT AAA motif_1 0.05725742
## 5 3 3 GAT AAA motif_1 0.05725742
## 6 3 3 GAT AAA motif_1 0.05725742
# Every row of a given (distance, pattern) combination carries the same
# frequency, so keep one row per distinct combination for plotting.
df.all.unique=df.all[!duplicated(df.all), ]
# Label each row as "<anchor 3mer>-<query 3mer>"
df.all.unique$pattern=paste0(df.all.unique$anchor_3mer, "-", df.all.unique$query_3mer)
# Show which patterns are present
unique(df.all.unique$pattern)
## [1] "GAT-AAA" "GAT-TAA" "GAT-ATA" "GAT-TTA" "GAT-AAT" "GAT-TAT" "GAT-GAT"
## [8] "GAT-ATT" "GAT-TTT" "GAT-ATC" "GAT-AGA" "GAT-TCT" "GAT-TAG" "GAT-CTA"
# Fix the order of the patterns explicitly; the order of the factor levels
# determines the order of the groups in the plot and its legend.
df.all.unique$pattern = factor(df.all.unique$pattern, levels = c("GAT-AAA" ,"GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT", "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"))
library(lattice)
library(latticeExtra)
# Uncomment the pdf()/print()/dev.off() lines to write the figure to a PDF.
#pdf(paste0('xy_closest_2nd_GAT_to_closest_1st_GAT_to_GATA3_pos_', motif, '_compare_to_DHS.pdf'), width=15,height=5)
#print(
# Frequency of each absolute distance (0-50 bp shown), one colour per pattern.
xyplot(actual_freq ~ abs.dis,
data = df.all.unique,
groups = pattern,
# Legend to the right of the panel, drawn with line samples only
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.3),
#type = c('p', 'smooth'),
xlab = "relative distance between two triplets (bp)",
ylab="Frequency of Enrichment",
main="GATA3 peak with motif1",
between=list(y=1.0),
# x axis ticks every 5 bp, labels rotated by 45 degrees
scales = list(x = list(rot = 45, at = seq(from = 0, to = 50, by = 5))),
# Line colours used by the legend; the same 14 colours are passed to
# panel.xyplot() below for the plotted symbols.
par.settings = list(superpose.line = list(col=c("blue", "red", "green", "cyan", "magenta", "yellow",
"orange", "purple", "pink", "darkgreen", "purple4",
"brown", "slategray", "darkolivegreen"), lwd=2), strip.background=list(col="grey85")),
# Light grey vertical guide lines at 1..15 bp, drawn before the data
panel = function(x,y,...) {panel.abline(v=c(seq(1, 15, 1)), col = "grey90")
panel.xyplot(x, y,
#col=c(colorRampPalette(c("red","blue"))(14)),
col=c("blue", "red", "green", "cyan", "magenta", "yellow",
"orange", "purple", "pink", "darkgreen", "purple4",
"brown", "slategray", "darkolivegreen"),
pch=18,
cex=0.6,...)
})
#)
#dev.off()
Compare to DHS:
# List the GAT-anchored DHS control distance files, then count the lines in each.
ls *to.1st.plus.GAT.indep.DHS.control.bed
wc -l *to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
## 57906 closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
## 810684 total
DHS:
# Load the DHS-control distance tables anchored on GAT, one file per query
# triplet, and attach the relative frequency of every absolute distance.
Anchor_triplets <- "GAT"
Query_triplets <- c(
  "AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT",
  "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA"
)
df.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  df.peak.dis.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    df.DHS.dis <- data.frame(matrix(nrow = 0, ncol = 6))
    colnames(df.DHS.dis) <- c(
      "dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq"
    )
    for (closest_2nd_dis in Sys.glob(paste0(
      "./closest.2nd.plus.", Query_triplet,
      ".to.1st.plus.", Anchor_triplet, ".indep.DHS.control.bed"
    ))) {
      print(closest_2nd_dis)
      # Column 11 of the file is used as the distance; cbind() turns every
      # column into text, so `dis` is converted back to integer just below.
      temp <- as.data.frame(cbind(
        read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11],
        Anchor_triplet,
        Query_triplet,
        "indep.DHS.control"
      ))
      colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
      temp$dis <- as.integer(temp$dis)
      temp$abs.dis <- abs(temp$dis)
      # Frequency of each absolute distance within this one file.
      actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
      temp1 <- merge(
        temp, actual_frequencies,
        by.x = "abs.dis", by.y = "value", all.x = TRUE
      )
      df.DHS.dis <- rbind(df.DHS.dis, temp1)
    }
    df.peak.dis.all.DHS <- rbind(df.peak.dis.all.DHS, df.DHS.dis)
  }
  df.all.DHS <- rbind(df.all.DHS, df.peak.dis.all.DHS)
}
# Quick look at what was loaded for the DHS control.
str(df.all.DHS)
unique(df.all.DHS$anchor_3mer)
unique(df.all.DHS$query_3mer)
nrow(df.all.DHS)
head(df.all.DHS)
# One row per distinct combination of columns, labelled "<anchor>-<query>".
df.all.DHS.unique <- unique(df.all.DHS)
df.all.DHS.unique$pattern <- paste(
  df.all.DHS.unique$anchor_3mer,
  df.all.DHS.unique$query_3mer,
  sep = "-"
)
unique(df.all.DHS.unique$pattern)
df.all.DHS.unique$pattern <- factor(
  df.all.DHS.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)
# Stack peaks and control; the level order of `status` decides which group
# gets the first plotting colour.
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
df.plot$status <- factor(df.plot$status, levels = c("motif_1", "indep.DHS.control"))
library(lattice)
library(latticeExtra)
# One panel per triplet pair (4 x 4 grid), peaks and DHS control overlaid.
pdf(
  "xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_pos_motif1_compare_to_DHS.pdf",
  width = 10, height = 10
)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    layout = c(4, 4),
    #type = c('p', 'smooth'),
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    main = "GATA3 peak with motif1 (red) vs. DHS regions (black)",
    between = list(x = 1.0, y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Guide lines at 1..15 bp go underneath the points.
      panel.abline(v = seq(1, 15, by = 1), col = "grey90")
      panel.xyplot(
        x, y,
        #col=c(colorRampPalette(c("red","blue"))(14)),
        col = c("red", "black"),
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )
)
dev.off()
# Relative frequency of every distinct value in `data`.
#
# data: an atomic vector (here: absolute distances).
#
# Returns a data frame with one row per distinct value and two columns:
#   value       - the distinct value, as the text label produced by table()
#   actual_freq - count of that value divided by length(data)
# table() leaves NA out of the counts by default while length() still counts
# it, so NAs lower the frequencies rather than getting a row of their own.
# Empty input yields a zero-row data frame that still has both columns, so a
# later merge(..., by.y = "value") keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() turns that into a
    # zero-length column instead of silently dropping `value`.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data)
  )
}
# Load the GAT-anchored distance tables for every motif class and query
# triplet, attaching the relative frequency of each absolute distance.
my_motifs <- c("motif_4", "motif_2", "motif_6", "motif_5")
Anchor_triplets <- c("GAT")
Query_triplets <- c(
  "AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT",
  "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA"
)
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (motif in my_motifs) {
  print(motif)
  for (Anchor_triplet in Anchor_triplets) {
    print(Anchor_triplet)
    df.peak.dis.all <- data.frame(matrix(nrow = 0, ncol = 6))
    for (Query_triplet in Query_triplets) {
      print(Query_triplet)
      df.peak.dis <- data.frame(matrix(nrow = 0, ncol = 6))
      colnames(df.peak.dis) <- c(
        "dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq"
      )
      for (closest_2nd_dis in Sys.glob(file.path(paste0(
        "./closest.2nd.plus.", Query_triplet,
        ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed"
      )))) {
        print(closest_2nd_dis)
        # Column 11 of the file is used as the distance; cbind() turns every
        # column into text, so `dis` is converted back to integer just below.
        temp <- as.data.frame(cbind(
          read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11],
          Anchor_triplet,
          Query_triplet,
          motif
        ))
        colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
        temp$dis <- as.integer(temp$dis)
        temp$abs.dis <- abs(temp$dis)
        # Frequency of each absolute distance within this one file.
        actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
        temp1 <- merge(
          temp, actual_frequencies,
          by.x = "abs.dis", by.y = "value", all.x = TRUE
        )
        df.peak.dis <- rbind(df.peak.dis, temp1)
      }
      df.peak.dis.all <- rbind(df.peak.dis.all, df.peak.dis)
    }
    # Append once per anchor. Doing this after the anchor loop would keep
    # only the last anchor of each motif as soon as more than one is listed.
    df.all <- rbind(df.all, df.peak.dis.all)
  }
}
# Quick look at what was loaded for the motif classes.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
unique(df.all$status)
nrow(df.all)
head(df.all)
# One row per distinct combination of columns, labelled "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(
  df.all.unique$anchor_3mer,
  df.all.unique$query_3mer,
  sep = "-"
)
unique(df.all.unique$pattern)
df.all.unique$pattern <- factor(
  df.all.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)
unique(df.all.unique$status)
df.all.unique$status <- factor(
  df.all.unique$status,
  levels = c("motif_4", "motif_2", "motif_6", "motif_5")
)
# One PDF per motif class: that motif's peaks overlaid on the DHS control,
# one panel per triplet pair.
my_motifs <- c("motif_4", "motif_2", "motif_6", "motif_5")
for (motif in my_motifs) {
  print(motif)
  df.plot <- rbind(df.all.unique[df.all.unique$status == motif, ], df.all.DHS.unique)
  # Level order decides which group gets the first plotting colour.
  df.plot$status <- factor(df.plot$status, levels = c(motif, "indep.DHS.control"))
  library(lattice)
  library(latticeExtra)
  pdf(
    paste0("xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_pos_", motif, "_compare_to_DHS.pdf"),
    width = 10, height = 10
  )
  print(
    xyplot(
      actual_freq ~ abs.dis | pattern,
      data = df.plot,
      groups = status,
      auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
      aspect = 1,
      xlim = c(0, 50),
      ylim = c(0, 0.3),
      layout = c(4, 4),
      #type = c('p', 'smooth'),
      xlab = "relative distance between two triplets (bp)",
      ylab = "Frequency of Enrichment",
      main = paste0("GATA3 peak with ", motif, " (red) vs. DHS regions (black)"),
      between = list(x = 1.0, y = 1.0),
      scales = list(
        x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
      ),
      par.settings = list(
        superpose.line = list(col = c("red", "black"), lwd = 2),
        strip.background = list(col = "grey85")
      ),
      panel = function(x, y, ...) {
        # Guide lines at 1..15 bp go underneath the points.
        panel.abline(v = seq(1, 15, by = 1), col = "grey90")
        panel.xyplot(
          x, y,
          #col=c(colorRampPalette(c("red","blue"))(14)),
          col = c("red", "black"),
          pch = 18,
          cex = 0.6,
          ...
        )
      }
    )
  )
  dev.off()
}
Prepare data:
# Relative frequency of every distinct value in `data`.
#
# data: an atomic vector (here: absolute distances).
#
# Returns a data frame with one row per distinct value and two columns:
#   value       - the distinct value, as the text label produced by table()
#   actual_freq - count of that value divided by length(data)
# table() leaves NA out of the counts by default while length() still counts
# it, so NAs lower the frequencies rather than getting a row of their own.
# Empty input yields a zero-row data frame that still has both columns, so a
# later merge(..., by.y = "value") keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() turns that into a
    # zero-length column instead of silently dropping `value`.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data)
  )
}
# Load the ATC-anchored distance tables for the selected motif classes and
# attach the relative frequency of each absolute distance.
#my_motifs = c("motif_1", "motif_4","motif_2", "motif_6", "motif_5")
my_motifs <- c("motif_5")
Anchor_triplets <- c("ATC")
Query_triplets <- c(
  "AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT",
  "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA"
)
# Initialise the accumulator once, before the motif loop. Resetting it inside
# the loop would discard every motif except the last when several are listed.
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (motif in my_motifs) {
  print(motif)
  for (Anchor_triplet in Anchor_triplets) {
    print(Anchor_triplet)
    df.peak.dis.all <- data.frame(matrix(nrow = 0, ncol = 6))
    for (Query_triplet in Query_triplets) {
      print(Query_triplet)
      df.peak.dis <- data.frame(matrix(nrow = 0, ncol = 6))
      colnames(df.peak.dis) <- c(
        "dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq"
      )
      for (closest_2nd_dis in Sys.glob(file.path(paste0(
        "./closest.2nd.plus.", Query_triplet,
        ".to.1st.plus.", Anchor_triplet, ".GATA3.with.", motif, ".bed"
      )))) {
        print(closest_2nd_dis)
        # Column 11 of the file is used as the distance; cbind() turns every
        # column into text, so `dis` is converted back to integer just below.
        temp <- as.data.frame(cbind(
          read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11],
          Anchor_triplet,
          Query_triplet,
          motif
        ))
        colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
        temp$dis <- as.integer(temp$dis)
        temp$abs.dis <- abs(temp$dis)
        # Frequency of each absolute distance within this one file.
        actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
        temp1 <- merge(
          temp, actual_frequencies,
          by.x = "abs.dis", by.y = "value", all.x = TRUE
        )
        df.peak.dis <- rbind(df.peak.dis, temp1)
      }
      df.peak.dis.all <- rbind(df.peak.dis.all, df.peak.dis)
    }
    df.all <- rbind(df.all, df.peak.dis.all)
  }
}
# Quick look at what was loaded for the ATC anchor.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
nrow(df.all)
head(df.all)
# One row per distinct combination of columns, labelled "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(
  df.all.unique$anchor_3mer,
  df.all.unique$query_3mer,
  sep = "-"
)
unique(df.all.unique$pattern)
# No explicit level order is given here, so as.factor() picks the levels.
df.all.unique$pattern <- as.factor(df.all.unique$pattern)
DHS:
# Load the DHS-control distance tables anchored on ATC (read from ../DHS/),
# one file per query triplet, with the relative frequency of each distance.
Anchor_triplets <- "ATC"
Query_triplets <- c(
  "AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT",
  "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA"
)
df.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  df.peak.dis.all.DHS <- data.frame(matrix(nrow = 0, ncol = 6))
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    df.DHS.dis <- data.frame(matrix(nrow = 0, ncol = 6))
    colnames(df.DHS.dis) <- c(
      "dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq"
    )
    for (closest_2nd_dis in Sys.glob(paste0(
      "../DHS/closest.2nd.plus.", Query_triplet,
      ".to.1st.plus.", Anchor_triplet, ".indep.DHS.control.bed"
    ))) {
      print(closest_2nd_dis)
      # Column 11 of the file is used as the distance; cbind() turns every
      # column into text, so `dis` is converted back to integer just below.
      temp <- as.data.frame(cbind(
        read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11],
        Anchor_triplet,
        Query_triplet,
        "indep.DHS.control"
      ))
      colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
      temp$dis <- as.integer(temp$dis)
      temp$abs.dis <- abs(temp$dis)
      # Frequency of each absolute distance within this one file.
      actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
      temp1 <- merge(
        temp, actual_frequencies,
        by.x = "abs.dis", by.y = "value", all.x = TRUE
      )
      df.DHS.dis <- rbind(df.DHS.dis, temp1)
    }
    df.peak.dis.all.DHS <- rbind(df.peak.dis.all.DHS, df.DHS.dis)
  }
  df.all.DHS <- rbind(df.all.DHS, df.peak.dis.all.DHS)
}
# Quick look at what was loaded for the DHS control.
str(df.all.DHS)
unique(df.all.DHS$anchor_3mer)
unique(df.all.DHS$query_3mer)
nrow(df.all.DHS)
head(df.all.DHS)
# One row per distinct combination of columns, labelled "<anchor>-<query>".
df.all.DHS.unique <- unique(df.all.DHS)
df.all.DHS.unique$pattern <- paste(
  df.all.DHS.unique$anchor_3mer,
  df.all.DHS.unique$query_3mer,
  sep = "-"
)
unique(df.all.DHS.unique$pattern)
df.all.DHS.unique$pattern <- as.factor(df.all.DHS.unique$pattern)
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
# `motif` is whatever value the preceding `for (motif in my_motifs)` loop left
# behind, i.e. the last entry of my_motifs.
df.plot$status <- factor(df.plot$status, levels = c(motif, "indep.DHS.control"))
library(lattice)
library(latticeExtra)
# One panel per triplet pair (4 x 4 grid), peaks and DHS control overlaid.
pdf(
  "xy_closest_2nd_XXX_to_closest_1st_ATC_to_GATA3_pos_motif_5_compare_to_DHS.pdf",
  width = 10, height = 10
)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    layout = c(4, 4),
    #type = c('p', 'smooth'),
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    main = paste0("GATA3 peak with ", motif, " (red) vs. DHS regions (black)"),
    between = list(x = 1.0, y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Guide lines at 1..15 bp go underneath the points.
      panel.abline(v = seq(1, 15, by = 1), col = "grey90")
      panel.xyplot(
        x, y,
        #col=c(colorRampPalette(c("red","blue"))(14)),
        col = c("red", "black"),
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )
)
dev.off()
# List the GAT-anchored distance files for peaks without motifs, then count the lines in each.
ls *.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
wc -l *.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 37308 closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## 522312 total
# Relative frequency of every distinct value in `data`.
#
# data: an atomic vector (here: absolute distances).
#
# Returns a data frame with one row per distinct value and two columns:
#   value       - the distinct value, as the text label produced by table()
#   actual_freq - count of that value divided by length(data)
# table() leaves NA out of the counts by default while length() still counts
# it, so NAs lower the frequencies rather than getting a row of their own.
# Empty input yields a zero-row data frame that still has both columns, so a
# later merge(..., by.y = "value") keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  data.frame(
    # names() is NULL for an empty table; as.character() turns that into a
    # zero-length column instead of silently dropping `value`.
    value = as.character(names(counts)),
    actual_freq = as.vector(counts) / length(data)
  )
}
# Load the GAT-anchored distance tables for peaks without motifs, one file
# per query triplet, with the relative frequency of each absolute distance.
Anchor_triplets <- "GAT"
#Anchor_triplets = c("ATC")
Query_triplets <- c(
  "AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT",
  "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA"
)
df.all <- data.frame(matrix(nrow = 0, ncol = 6))
for (Anchor_triplet in Anchor_triplets) {
  print(Anchor_triplet)
  df.peak.dis.all <- data.frame(matrix(nrow = 0, ncol = 6))
  for (Query_triplet in Query_triplets) {
    print(Query_triplet)
    df.peak.dis <- data.frame(matrix(nrow = 0, ncol = 6))
    colnames(df.peak.dis) <- c(
      "dis", "anchor_3mer", "query_3mer", "status", "abs.dis", "actual_freq"
    )
    for (closest_2nd_dis in Sys.glob(paste0(
      "./closest.2nd.plus.", Query_triplet,
      ".to.1st.plus.", Anchor_triplet, ".to.GATA3_without_motifs_123456_78.bed"
    ))) {
      print(closest_2nd_dis)
      # Column 11 of the file is used as the distance; cbind() turns every
      # column into text, so `dis` is converted back to integer just below.
      temp <- as.data.frame(cbind(
        read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11],
        Anchor_triplet,
        Query_triplet,
        "peak_without_motif"
      ))
      colnames(temp) <- c("dis", "anchor_3mer", "query_3mer", "status")
      temp$dis <- as.integer(temp$dis)
      temp$abs.dis <- abs(temp$dis)
      # Frequency of each absolute distance within this one file.
      actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
      temp1 <- merge(
        temp, actual_frequencies,
        by.x = "abs.dis", by.y = "value", all.x = TRUE
      )
      df.peak.dis <- rbind(df.peak.dis, temp1)
    }
    df.peak.dis.all <- rbind(df.peak.dis.all, df.peak.dis)
  }
  df.all <- rbind(df.all, df.peak.dis.all)
}
# Quick look at what was loaded for the peaks without motifs.
str(df.all)
unique(df.all$anchor_3mer)
unique(df.all$query_3mer)
unique(df.all$status)
nrow(df.all)
head(df.all)
# One row per distinct combination of columns, labelled "<anchor>-<query>".
df.all.unique <- unique(df.all)
df.all.unique$pattern <- paste(
  df.all.unique$anchor_3mer,
  df.all.unique$query_3mer,
  sep = "-"
)
unique(df.all.unique$pattern)
## [1] "GAT-AAA" "GAT-TAA" "GAT-ATA" "GAT-TTA" "GAT-AAT" "GAT-TAT" "GAT-GAT"
## [8] "GAT-ATT" "GAT-TTT" "GAT-ATC" "GAT-AGA" "GAT-TCT" "GAT-TAG" "GAT-CTA"
df.all.unique$pattern <- factor(
  df.all.unique$pattern,
  levels = c(
    "GAT-AAA", "GAT-TAA", "GAT-ATA", "GAT-TTA", "GAT-AAT", "GAT-TAT", "GAT-GAT",
    "GAT-ATT", "GAT-TTT", "GAT-ATC", "GAT-AGA", "GAT-TCT", "GAT-TAG", "GAT-CTA"
  )
)
# NOTE(review): the recorded output below looks stale. The loader for this
# section writes "peak_without_motif" into `status`, not "motif_1".
unique(df.all.unique$status)
## [1] "motif_1"
# NOTE(review): df.all.DHS.unique has to be the GAT-anchored DHS table here.
# The DHS section immediately preceding this one in the file loads the
# ATC-anchored files, so confirm which table is in memory before plotting.
df.plot <- rbind(df.all.unique, df.all.DHS.unique)
df.plot$status <- factor(
  df.plot$status,
  levels = c("peak_without_motif", "indep.DHS.control")
)
library(lattice)
library(latticeExtra)
# One panel per triplet pair (4 x 4 grid), peaks and DHS control overlaid.
pdf(
  "xy_closest_2nd_XXX_to_closest_1st_GAT_to_GATA3_without_motifs_compare_to_DHS.pdf",
  width = 10, height = 10
)
print(
  xyplot(
    actual_freq ~ abs.dis | pattern,
    data = df.plot,
    groups = status,
    auto.key = list(space = "right", lines = TRUE, points = FALSE, cex = 1),
    aspect = 1,
    xlim = c(0, 50),
    ylim = c(0, 0.3),
    layout = c(4, 4),
    #type = c('p', 'smooth'),
    xlab = "relative distance between two triplets (bp)",
    ylab = "Frequency of Enrichment",
    main = "GATA3 peak without motifs (red) vs. DHS regions (black)",
    between = list(x = 1.0, y = 1.0),
    scales = list(
      x = list(relation = "free", rot = 45, at = seq(from = 0, to = 50, by = 5))
    ),
    par.settings = list(
      superpose.line = list(col = c("red", "black"), lwd = 2),
      strip.background = list(col = "grey85")
    ),
    panel = function(x, y, ...) {
      # Guide lines at 1..15 bp go underneath the points.
      panel.abline(v = seq(1, 15, by = 1), col = "grey90")
      panel.xyplot(
        x, y,
        #col=c(colorRampPalette(c("red","blue"))(14)),
        col = c("red", "black"),
        pch = 18,
        cex = 0.6,
        ...
      )
    }
  )
)
dev.off()
Idea 1: a heatmap with the 14 3mers on both the x and y axes, colour-coded by the density at different distances.
Idea 2: a heatmap whose y axis is all peaks ranked by intensity, plotting the coordinates of the 1st and 2nd 3mer for each peak.
“heatmap_test.txt” stores a subset of data with distance info from closest +GAT for peaks with motif1 summit, and distance info from 2nd to 1st closest GAT for peaks with motif1.
df <- read.table("heatmap_test.txt", header = TRUE)[, c(1, 3)]

# Build a peaks-by-position indicator matrix from two columns of signed
# distances.
#
# df:        data frame whose first two columns are signed integer distances.
# dis.bound: largest absolute distance to keep; rows where any column lies
#            outside [-dis.bound, dis.bound] are dropped.
#
# Returns a numeric matrix with one row per kept row of `df`, ordered by
# decreasing absolute value of column 1, and 2 * dis.bound + 1 columns.
# Column dis.bound + 1 is distance 0, so distance d lands in column
# d + dis.bound + 1. Column-1 distances are marked 1 and column-2 distances
# 0.6; if both fall in the same cell, 0.6 wins because it is written last.
create_matrix <- function(df, dis.bound) {
  out_of_bound <- rowSums(df > dis.bound | df < -dis.bound) > 0
  df.in.bound <- df[!out_of_bound, ]
  df.in.bound <- df.in.bound[order(rank(-abs(df.in.bound[, 1]))), ]

  # The +1 matters: without it, distance -dis.bound maps to column 0, which R
  # silently ignores, and the last column can never be filled.
  centre <- dis.bound + 1
  mat <- matrix(0, nrow = nrow(df.in.bound), ncol = 2 * dis.bound + 1)
  # seq_len() keeps the no-rows case empty (1:0 would count down from 1).
  rows <- seq_len(nrow(df.in.bound))
  mat[cbind(rows, df.in.bound[, 1] + centre)] <- 1
  mat[cbind(rows, df.in.bound[, 2] + centre)] <- 0.6
  mat
}

mat <- create_matrix(df, 100)
mat_df <- as.data.frame(as.table(mat))
names(mat_df) <- c("Peak", "Distance", "Value")
# Plot
#pdf("test_heatmap_spaced_GAT_distance_to_peak_summit.pdf")
#print(
levelplot(
  Value ~ Distance * Peak,
  data = mat_df,
  col.regions = colorRampPalette(c("white", "blue", "red")),
  aspect = 2,
  at = seq(0, 1, length = 150),
  axes = FALSE,
  sub = "",
  colorkey = FALSE,
  region = TRUE,
  scales = list(draw = FALSE),
  xlab = "Distance to peak summit",
  ylab = "GATA3 Peaks with motif1",
  main = "closest GAT (red) and 2nd closest GAT (blue) Distances to peak summit",
  newpage = FALSE,
  panel = function(...) {
    panel.levelplot(...)
    # Distance 0 sits in the middle column of `mat` (column 101 of 201).
    panel.abline(v = (ncol(mat) + 1) / 2, col = "black")
  }
)
#)
#dev.off()
RSAT:
Confirm how the software counts the observed spaced 3mers: is a dyad counted twice if it occurs at two different loci on the same sequence? – Yes, see the coherence check section under RSAT.
Combine results regardless of upstream or downstream orientation.
without_motifs_123456_78_161bp_mast.bed
37308 peaks.
step1: find the 3mer coordinates (on hg38 genome) that are closest to each peak summit with closestBed.
step2: remove the closest 3mer coordinates from the whole genome 3mer coordinates on the same strand with bedtools subtract.
step3: find the second closest 3mer relative to the closest one.
My list of 3mer: GAT, ATC.
240218_closestBed.R:
(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218)
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# closestBed function
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# functionstring: path to the closestBed executable.
# bed1, bed2:     data frames written out (tab-separated, no header) as the
#                 -a and -b inputs; both are coordinate-sorted with `sort`
#                 before closestBed is called.
# opt.string:     extra command-line options handed to closestBed verbatim.
#
# Returns the closestBed output as a data frame whose columns are named after
# bed1's columns, then bed2's columns, then "dis" (so opt.string is expected
# to add exactly one trailing column, e.g. -d).
# Stops with an error if `sort` or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Write coordinates without scientific notation, and give the caller's
  # setting back when the function exits.
  old_options <- options(scipen = 99)
  on.exit(options(old_options), add = TRUE)

  # Scratch files stay in the working directory as before, but get unique
  # names so that jobs running in the same directory cannot overwrite each
  # other. They are removed on exit, including after an error.
  scratch <- function(stem) {
    tempfile(pattern = paste0(stem, "."), tmpdir = getwd(), fileext = ".bed")
  }
  a_file <- scratch("a.file")
  b_file <- scratch("b.file")
  a_sorted <- scratch("a.file.sorted")
  b_sorted <- scratch("b.file.sorted")
  out_file <- scratch("out.file")
  on.exit(unlink(c(a_file, b_file, a_sorted, b_sorted, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status.
  run_command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0L) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort both inputs by chromosome, then numerically by start
  run_command(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted)))
  run_command(paste("sort -k1,1 -k2,2n", shQuote(b_file), ">", shQuote(b_sorted)))

  # call closestBed on the sorted inputs
  run_command(paste(
    functionstring, opt.string,
    "-a", shQuote(a_sorted), "-b", shQuote(b_sorted),
    ">", shQuote(out_file),
    sep = " "
  ))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"
dir3 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"
prioritized_triplets <- c("GAT", "ATC")
library(bigWig)

# Resolve a glob pattern that must match exactly one file; otherwise stop with
# a message naming the pattern instead of an opaque read.table() error.
single_match <- function(pattern) {
  hits <- Sys.glob(pattern)
  if (length(hits) != 1L) {
    stop("expected exactly one file for '", pattern, "', found ", length(hits), call. = FALSE)
  }
  hits
}

# These two tables do not depend on the triplet, so read them once here
# rather than once per loop iteration.
# peak summits
GATA3_peak_summits <- center.bed(
  read.table(paste0(dir2, "without_motifs_123456_78_161bp_mast.bed"), header = FALSE),
  upstreamWindow = 0, downstreamWindow = 0
)
# consensus neg
indep.DHS.control.consensus <- center.bed(
  read.table(paste0(dir3, "MCF7DHS_consensus_noGATA_without_motifs_123456_78.bed"), header = FALSE),
  upstreamWindow = 0, downstreamWindow = 0
)

for (triplet in prioritized_triplets) {
  print(triplet)
  # 3mer genome coordinates
  plus.triplet.file <- read.table(
    file = single_match(paste0(dir1, "hg38.3.3.3plus.*_", triplet, ".bed")),
    sep = "\t", header = FALSE
  )
  minus.triplet.file <- read.table(
    file = single_match(paste0(dir1, "hg38.3.3.3minus.*_", triplet, ".bed")),
    sep = "\t", header = FALSE
  )

  # closestBed--1st closest plus
  ## peaks
  closest.1st.plus.triplet.to.peak <- bedTools.closest(
    bed1 = GATA3_peak_summits[, 1:3], bed2 = plus.triplet.file, opt.string = "-d -t first"
  )
  write.table(
    closest.1st.plus.triplet.to.peak,
    file = paste0("closest.1st.plus.", triplet, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
  ## DHS control
  closest.1st.plus.triplet.to.indep.DHS.control.consensus <- bedTools.closest(
    bed1 = indep.DHS.control.consensus[, 1:3], bed2 = plus.triplet.file, opt.string = "-d -t first"
  )
  write.table(
    closest.1st.plus.triplet.to.indep.DHS.control.consensus,
    file = paste0("closest.1st.plus.", triplet, ".to.indep.DHS.control.consensus.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )

  # closestBed--1st closest minus
  ## peaks
  closest.1st.minus.triplet.to.peak <- bedTools.closest(
    bed1 = GATA3_peak_summits[, 1:3], bed2 = minus.triplet.file, opt.string = "-d -t first"
  )
  write.table(
    closest.1st.minus.triplet.to.peak,
    file = paste0("closest.1st.minus.", triplet, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
  ## DHS control
  closest.1st.minus.triplet.to.indep.DHS.control.consensus <- bedTools.closest(
    bed1 = indep.DHS.control.consensus[, 1:3], bed2 = minus.triplet.file, opt.string = "-d -t first"
  )
  write.table(
    closest.1st.minus.triplet.to.indep.DHS.control.consensus,
    file = paste0("closest.1st.minus.", triplet, ".to.indep.DHS.control.consensus.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}
runR.sh
#!/bin/bash
# SLURM batch script: loads the R 4.1.2 module and runs 240218_closestBed.R.
# Job stdout and stderr go to runR.sh_<jobid>.out / runR.sh_<jobid>.err.
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
module load R/4.1.2
Rscript 240218_closestBed.R
bedtools subtract:
#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err
input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/
prioritized_triplets=("GAT" "ATC")
module load bedtools

# remove_closest STRAND TRIPLET CLOSEST_STEM OUTPUT_BED
#
# Writes to OUTPUT_BED the genome-wide STRAND-strand TRIPLET records minus the
# records listed in ${input_dir2}CLOSEST_STEM.bed.
#   STRAND        plus | minus
#   TRIPLET       3-mer, e.g. GAT
#   CLOSEST_STEM  closestBed output file name without the .bed extension
#   OUTPUT_BED    file to write
remove_closest() {
    local strand=$1 triplet=$2 closest_stem=$3 output_bed=$4
    local genome_sorted="hg38.3.3.3${strand}.${triplet}.sorted.bed"
    local closest_sorted="${closest_stem}.uniq.sorted.bed"

    # The glob has to stay outside the quotes so the shell expands it.
    sort -k1,1 -k2,2n "${input_dir1}"hg38.3.3.3"${strand}"*"${triplet}".bed > "${genome_sorted}"

    # Keep columns 4-10 of the closestBed output (the closest files are written
    # with three region columns first, so these are presumably the matched
    # 3-mer record), tab-separated. Sort BEFORE uniq: uniq only collapses
    # adjacent duplicate lines.
    awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "${input_dir2}${closest_stem}.bed" \
        | sort -k1,1 -k2,2n \
        | uniq > "${closest_sorted}"

    # -f 1.00 -s: remove an A record only on a full-length, same-strand overlap.
    bedtools subtract -a "${genome_sorted}" -b "${closest_sorted}" -f 1.00 -s > "${output_bed}"

    rm "${genome_sorted}" "${closest_sorted}"
}

# GATA3 peaks without motifs: loop over the prioritized triplets and both strands
for triplet in "${prioritized_triplets[@]}"
do
    echo "$triplet"
    for strand in plus minus
    do
        remove_closest "$strand" "$triplet" \
            "closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast" \
            "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_without_motifs_123456_78_161bp_mast.bed"
    done
done

# independent DHS control
for triplet in "${prioritized_triplets[@]}"
do
    echo "$triplet"
    for strand in plus minus
    do
        remove_closest "$strand" "$triplet" \
            "closest.1st.${strand}.${triplet}.to.indep.DHS.control.consensus" \
            "hg38.3.3.3${strand}.36_${triplet}_without_1st_${strand}_${triplet}_to_indep_DHS_control.bed"
    done
done
240218_closestBed2.R
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# functionstring: path to the closestBed executable.
# bed1, bed2:     data frames written out (tab-separated, no header) as the
#                 -a and -b inputs; both are coordinate-sorted with `sort`
#                 before closestBed is called.
# opt.string:     extra command-line options handed to closestBed verbatim.
#
# Returns the closestBed output as a data frame whose columns are named after
# bed1's columns, then bed2's columns, then "dis" (so opt.string is expected
# to add exactly one trailing column, e.g. -d).
# Stops with an error if `sort` or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Write coordinates without scientific notation, and give the caller's
  # setting back when the function exits.
  old_options <- options(scipen = 99)
  on.exit(options(old_options), add = TRUE)

  # Scratch files stay in the working directory as before, but get unique
  # names so that jobs running in the same directory cannot overwrite each
  # other. They are removed on exit, including after an error.
  scratch <- function(stem) {
    tempfile(pattern = paste0(stem, "."), tmpdir = getwd(), fileext = ".bed")
  }
  a_file <- scratch("a.file")
  b_file <- scratch("b.file")
  a_sorted <- scratch("a.file.sorted")
  b_sorted <- scratch("b.file.sorted")
  out_file <- scratch("out.file")
  on.exit(unlink(c(a_file, b_file, a_sorted, b_sorted, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status.
  run_command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0L) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort both inputs by chromosome, then numerically by start
  run_command(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted)))
  run_command(paste("sort -k1,1 -k2,2n", shQuote(b_file), ">", shQuote(b_sorted)))

  # call closestBed on the sorted inputs
  run_command(paste(
    functionstring, opt.string,
    "-a", shQuote(a_sorted), "-b", shQuote(b_sorted),
    ">", shQuote(out_file),
    sep = " "
  ))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}
#library(lattice)
#library(latticeExtra)
#library(Biostrings)
library(bigWig)
# List of prioritized triplets
prioritized_triplets <- c("GAT", "ATC")
# Every ordered (first, last) pairing of the prioritized triplets, including a
# triplet paired with itself; the first triplet varies slowest. Nothing is
# de-duplicated. Built with rep() instead of growing a list inside nested
# 1:length() loops, so an empty triplet list gives an empty table.
first_3_bases <- rep(prioritized_triplets, each = length(prioritized_triplets))
last_3_bases <- rep(prioritized_triplets, times = length(prioritized_triplets))
# The concatenated 6mers, kept as a list of single strings as before.
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))
# Create data frame with first 3 bases and last 3 bases
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)
# nested loop
dir <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/"
# Per-pattern window sizes; read below through the columns `pattern`,
# `plus_upstream` and `plus_downstream`.
win <- read.csv("/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/pattern_anchor_at_GorC_for_bigWig_pkg.csv")

# GATA3 peaks without motifs (the files read here are the
# GATA3_without_motifs_123456_78_161bp_mast ones)
# seq_len() skips the loop when df has no rows; 1:nrow(df) would run it for
# rows 1 and 0.
for (i in seq_len(nrow(df))) {
  pattern1 <- df[i, 1]
  pattern2 <- df[i, 2]
  # anchor position: closest +/- pattern1 relative to G/C
  print(pattern1)
  # Columns 4:11 of the closest-1st file are handed to fiveprime.bed()
  # (presumably the matched 3mer record plus the distance column).
  closest_plus_3mer_to_GATA3_peak_summits <- fiveprime.bed(
    read.table(
      paste0(dir, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"),
      header = FALSE
    )[, 4:11],
    upstreamWindow = win[win$pattern == pattern1, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern1, "plus_downstream"]
  )
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - relative to G/C
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(
      file = paste0(dir, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_123456_78_161bp_mast.bed"),
      sep = "\t", header = FALSE
    ),
    upstreamWindow = win[win$pattern == pattern2, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern2, "plus_downstream"]
  )
  # 2nd closest plus 3mer to closest plus 3mer
  closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif <- bedTools.closest(
    bed1 = closest_plus_3mer_to_GATA3_peak_summits[, 1:3],
    bed2 = plus.3mer.file,
    opt.string = "-d -t first"
  )
  write.table(
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif,
    file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3_without_motifs_123456_78_161bp_mast.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}

# DHS regions
for (i in seq_len(nrow(df))) {
  pattern1 <- df[i, 1]
  pattern2 <- df[i, 2]
  # anchor position: closest +/- pattern1 relative to G/C
  print(pattern1)
  closest_plus_3mer_to_DHS <- fiveprime.bed(
    read.table(
      paste0(dir, "closest.1st.plus.", pattern1, ".to.indep.DHS.control.consensus.bed"),
      header = FALSE
    )[, 4:11],
    upstreamWindow = win[win$pattern == pattern1, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern1, "plus_downstream"]
  )
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - relative to G/C
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(
      file = paste0(dir, "hg38.3.3.3plus.36_", pattern2, "_without_1st_plus_", pattern2, "_to_indep_DHS_control.bed"),
      sep = "\t", header = FALSE
    ),
    upstreamWindow = win[win$pattern == pattern2, "plus_upstream"],
    downstreamWindow = win[win$pattern == pattern2, "plus_downstream"]
  )
  # 2nd closest plus 3mer to closest plus 3mer
  closest.2nd.plus.3mer.to.1st.plus.3mer.DHS <- bedTools.closest(
    bed1 = closest_plus_3mer_to_DHS[, 1:3],
    bed2 = plus.3mer.file,
    opt.string = "-d -t first"
  )
  write.table(
    closest.2nd.plus.3mer.to.1st.plus.3mer.DHS,
    file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".indep.DHS.control.bed"),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}
runR.sh
#!/bin/bash
# SLURM batch script: loads the R 4.1.2 module and runs 240218_closestBed2.R.
# Job stdout and stderr go to runR.sh_<jobid>.out / runR.sh_<jobid>.err.
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
module load R/4.1.2
Rscript 240218_closestBed2.R
coherence check
wc -l closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
#37308 closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
wc -l ../without_motifs_123456_78_161bp_mast.bed
#37308 ../without_motifs_123456_78_161bp_mast.bed
I also randomly selected a few coordinates and verified the bases at those positions in the UCSC Genome Browser.
ls closest.2nd*.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.AGA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.ATC.to.1st.plus.ATC.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATC.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.ATT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.CTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.GAT.to.1st.plus.ATC.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3.without.motifs.quantile1.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.GAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAG.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TAT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TCT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTA.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_1.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_2.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_4.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_5.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.GATA3.with.motif_6.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed
## closest.2nd.plus.TTT.to.1st.plus.GAT.to.GATA3_without_motifs_123456_78.bed
density plot
xy plot
GATA3 peaks without motifs:
# Observed relative frequency of each distinct value in `data`.
#
# Args:
#   data: a vector of observations (e.g. absolute distances).
#
# Returns:
#   A data frame with one row per distinct value: `value` holds the value as
#   text (the names of the frequency table) and `actual_freq` holds its count
#   divided by length(data).
calculate_actual_frequency <- function(data) {
  value_counts <- table(data)
  value_share <- value_counts / length(data)
  data.frame(
    value = names(value_share),
    actual_freq = as.vector(value_share)
  )
}
# Build df.peak.nomotif: for GATA3 peaks without motifs, one row per line of
# every closest.2nd.plus.<query>.to.1st.plus.<anchor> file, holding the
# distance (column 11 of the file), its absolute value, the anchor/query 3mers
# parsed from the file name, and the observed frequency of that absolute
# distance within the file.
# NOTE(review): column 11 is presumably the distance that closestBed -d
# appends as the last column -- confirm against the input files.

# Fallback used only when no file matches the glob.
df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.peak.nomotif) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")

peak_nomotif_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.GATA3_without_motifs_123456_78_161bp_mast.bed"))
# Dots are escaped so they only match literal dots; group 1 is the query 3mer
# and group 2 is the anchor 3mer.
peak_nomotif_regex <- "^closest\\.2nd\\.plus\\.(.*)\\.to\\.1st\\.plus\\.(.*)\\.GATA3_without_motifs_123456_78_161bp_mast\\.bed$"
# Per-file tables are collected in a list and bound once at the end; calling
# rbind() inside the loop re-copies all accumulated rows on every iteration.
peak_nomotif_parts <- vector("list", length(peak_nomotif_files))

for (file_idx in seq_along(peak_nomotif_files)) {
  closest_2nd_dis <- peak_nomotif_files[file_idx]
  print(closest_2nd_dis)
  file_name <- basename(closest_2nd_dis)
  if (!grepl(peak_nomotif_regex, file_name)) {
    stop("Cannot parse anchor/query 3mer from file name: ", file_name, call. = FALSE)
  }
  anchor_3mer <- sub(peak_nomotif_regex, "\\2", file_name)
  print(anchor_3mer)
  query_3mer <- sub(peak_nomotif_regex, "\\1", file_name)
  print(query_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = '')[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  # Frequencies are computed per file, so each anchor/query pair sums to 1.
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  peak_nomotif_parts[[file_idx]] <- temp1
}

if (length(peak_nomotif_parts) > 0) {
  df.peak.nomotif <- do.call(rbind, peak_nomotif_parts)
}
## [1] "./closest.2nd.plus.ATC.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "ATC"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GAT"
## [1] "ATC"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.ATC.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "ATC"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GAT"
## [1] "GAT"
str(df.peak.nomotif)
## 'data.frame': 149232 obs. of 5 variables:
## $ abs.dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ anchor_3mer: chr "ATC" "ATC" "ATC" "ATC" ...
## $ query_3mer : chr "ATC" "ATC" "ATC" "ATC" ...
## $ actual_freq: num 0.0143 0.0143 0.0143 0.0143 0.0143 ...
unique(df.peak.nomotif$anchor_3mer)
## [1] "ATC" "GAT"
unique(df.peak.nomotif$query_3mer)
## [1] "ATC" "GAT"
nrow(df.peak.nomotif)
## [1] 149232
head(df.peak.nomotif)
## abs.dis dis anchor_3mer query_3mer actual_freq
## 1 3 3 ATC ATC 0.01434009
## 2 3 3 ATC ATC 0.01434009
## 3 3 3 ATC ATC 0.01434009
## 4 3 3 ATC ATC 0.01434009
## 5 3 3 ATC ATC 0.01434009
## 6 3 3 ATC ATC 0.01434009
df.peak.nomotif$status="GATA3_peak_without_motifs"
DHS regions (neg ctrl):
# Build df.ctrl: same table as for the GATA3 peaks, but read from the
# independent DHS control files
#   closest.2nd.plus.<query>.to.1st.plus.<anchor>.indep.DHS.control.bed
# One row per input line with the distance (column 11), its absolute value,
# the anchor/query 3mers parsed from the file name, and the observed
# frequency of that absolute distance within the file.
# NOTE(review): column 11 is presumably the distance that closestBed -d
# appends as the last column -- confirm against the input files.

# Fallback used only when no file matches the glob.
df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.ctrl) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")

ctrl_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed"))
# Dots are escaped so they only match literal dots; group 1 is the query 3mer
# and group 2 is the anchor 3mer.
ctrl_regex <- "^closest\\.2nd\\.plus\\.(.*)\\.to\\.1st\\.plus\\.(.*)\\.indep\\.DHS\\.control\\.bed$"
# Per-file tables are collected in a list and bound once at the end; calling
# rbind() inside the loop re-copies all accumulated rows on every iteration.
ctrl_parts <- vector("list", length(ctrl_files))

for (file_idx in seq_along(ctrl_files)) {
  closest_2nd_dis <- ctrl_files[file_idx]
  print(closest_2nd_dis)
  file_name <- basename(closest_2nd_dis)
  if (!grepl(ctrl_regex, file_name)) {
    stop("Cannot parse anchor/query 3mer from file name: ", file_name, call. = FALSE)
  }
  anchor_3mer <- sub(ctrl_regex, "\\2", file_name)
  print(anchor_3mer)
  query_3mer <- sub(ctrl_regex, "\\1", file_name)
  print(query_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = '')[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  # Frequencies are computed per file, so each anchor/query pair sums to 1.
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  ctrl_parts[[file_idx]] <- temp1
}

if (length(ctrl_parts) > 0) {
  df.ctrl <- do.call(rbind, ctrl_parts)
}
## [1] "./closest.2nd.plus.AAA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AAA"
## [1] "./closest.2nd.plus.AAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AAT"
## [1] "./closest.2nd.plus.AGA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "AGA"
## [1] "./closest.2nd.plus.ATA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATA"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.ATC.indep.DHS.control.bed"
## [1] "ATC"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATC.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATC"
## [1] "./closest.2nd.plus.ATT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "ATT"
## [1] "./closest.2nd.plus.CTA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "CTA"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.ATC.indep.DHS.control.bed"
## [1] "ATC"
## [1] "GAT"
## [1] "./closest.2nd.plus.GAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "GAT"
## [1] "./closest.2nd.plus.TAA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAA"
## [1] "./closest.2nd.plus.TAG.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAG"
## [1] "./closest.2nd.plus.TAT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TAT"
## [1] "./closest.2nd.plus.TCT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TCT"
## [1] "./closest.2nd.plus.TTA.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TTA"
## [1] "./closest.2nd.plus.TTT.to.1st.plus.GAT.indep.DHS.control.bed"
## [1] "GAT"
## [1] "TTT"
str(df.ctrl)
## 'data.frame': 926496 obs. of 5 variables:
## $ abs.dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ dis : int 3 3 3 3 3 3 3 3 3 3 ...
## $ anchor_3mer: chr "GAT" "GAT" "GAT" "GAT" ...
## $ query_3mer : chr "AAA" "AAA" "AAA" "AAA" ...
## $ actual_freq: num 0.0213 0.0213 0.0213 0.0213 0.0213 ...
unique(df.ctrl$anchor_3mer)
## [1] "GAT" "ATC"
unique(df.ctrl$query_3mer)
## [1] "AAA" "AAT" "AGA" "ATA" "ATC" "ATT" "CTA" "GAT" "TAA" "TAG" "TAT" "TCT"
## [13] "TTA" "TTT"
nrow(df.ctrl)
## [1] 926496
head(df.ctrl)
## abs.dis dis anchor_3mer query_3mer actual_freq
## 1 3 3 GAT AAA 0.02125859
## 2 3 3 GAT AAA 0.02125859
## 3 3 3 GAT AAA 0.02125859
## 4 3 3 GAT AAA 0.02125859
## 5 3 3 GAT AAA 0.02125859
## 6 3 3 GAT AAA 0.02125859
# Label the control rows, stack them under the GATA3 peak rows, and add the
# plotting factors. `pattern` is "<anchor>-<query>"; only the four GAT/ATC
# combinations are factor levels, so any other combination becomes NA.
df.ctrl[["status"]] <- "MCF7_DHS_regions"
df.plot <- rbind(df.peak.nomotif, df.ctrl)
df.plot[["pattern"]] <- paste0(df.plot[["anchor_3mer"]], "-", df.plot[["query_3mer"]])
df.plot[["pattern"]] <- factor(
  df.plot[["pattern"]],
  levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
)
df.plot[["status"]] <- factor(
  df.plot[["status"]],
  levels = c("GATA3_peak_without_motifs", "MCF7_DHS_regions")
)
nrow(df.plot)
## [1] 1075728
nrow(df.plot[!duplicated(df.plot), ])
## [1] 14862
summary(df.plot[!duplicated(df.plot), ])
## abs.dis dis anchor_3mer query_3mer
## Min. : 1.0 Min. : 1.0 Length:14862 Length:14862
## 1st Qu.: 187.2 1st Qu.: 187.2 Class :character Class :character
## Median : 375.0 Median : 375.0 Mode :character Mode :character
## Mean : 505.1 Mean : 505.1
## 3rd Qu.: 596.0 3rd Qu.: 596.0
## Max. :128938.0 Max. :128938.0
## actual_freq status pattern
## Min. :1.727e-05 GATA3_peak_without_motifs: 2610 ATC-ATC:1216
## 1st Qu.:3.454e-05 MCF7_DHS_regions :12252 GAT-ATC:1317
## Median :1.727e-04 ATC-GAT:1280
## Mean :1.346e-03 GAT-GAT:1176
## 3rd Qu.:1.179e-03 NA's :9873
## Max. :2.016e-01
library(lattice)
library(latticeExtra)
# Write a 15 x 5 PDF with one panel per anchor-query pattern (4 panels in a
# row) showing actual_freq against abs.dis, grouped by status (GATA3 peaks
# without motifs vs MCF7 DHS regions). Axes are limited to x in [0, 50] and
# y in [0, 0.15].
pdf('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs.pdf', width=15,height=5)
# print() is needed so the lattice object is actually drawn to the device.
print(
xyplot(actual_freq ~ abs.dis | pattern,
# The full table (not the de-duplicated one below) is plotted, so rows that
# share a distance repeat the same point; the density overlay in the panel
# function is computed from these repeated x values.
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.15),
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Frequency",
#main="Independent DHS Regions",
between=list(y=1.0),
# x tick marks every 5 bp from 0 to 50, labels rotated 45 degrees.
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
# Line colours used for the two status groups in the key (red, black).
par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
# Panel: light grey vertical guides at x = 1..10, then a density overlay of
# the distances, then the frequency points.
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
# NOTE(review): `data`, `from`, `to` and type = "count" are not obviously
# arguments of panel.densityplot; they are presumably absorbed by `...` --
# confirm they have the intended effect.
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","grey"), ...)
panel.xyplot(x, y,
col=c("red","black"),
pch=18,
cex=0.6,...)
})
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## quartz_off_screen
## 2
We can also demonstrate the relative enrichment by subtracting the actual frequency in the DHS regions from that in the GATA3 peaks at each relative distance.
# One row per distinct (distance, anchor, query) combination for the GATA3
# peaks; only the first five columns are kept.
uniq.df.peak.nomotif <- unique(df.peak.nomotif)[, 1:5]
# Same for the DHS regions, with the frequency column renamed so that both
# frequencies can sit side by side after the merge.
uniq.df.ctrl <- unique(df.ctrl)[, 1:5]
names(uniq.df.ctrl)[5] <- "actual_freq_DHS"
# Relative frequency = GATA3 peak frequency minus DHS frequency.
# all.x = TRUE keeps peak rows without a DHS counterpart; those get NA.
df.plot2 <- merge(
  uniq.df.peak.nomotif,
  uniq.df.ctrl,
  by = c("abs.dis", "dis", "anchor_3mer", "query_3mer"),
  all.x = TRUE
)
df.plot2[["rel_freq"]] <- ifelse(
  is.na(df.plot2[["actual_freq_DHS"]]),
  NA,
  df.plot2[["actual_freq"]] - df.plot2[["actual_freq_DHS"]]
)
# Panel factor "<anchor>-<query>", in the same order as the previous plot.
df.plot2[["pattern"]] <- paste0(df.plot2[["anchor_3mer"]], "-", df.plot2[["query_3mer"]])
df.plot2[["pattern"]] <- factor(
  df.plot2[["pattern"]],
  levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
)
library(lattice)
library(latticeExtra)
# Write a 15 x 5 PDF with one panel per anchor-query pattern (4 panels in a
# row) showing rel_freq (GATA3 peak frequency minus DHS frequency) against
# abs.dis. Axes are limited to x in [0, 50] and y in [0, 0.1], so negative
# differences fall outside the plotted range.
pdf(paste0('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_compare_to_DHS.pdf'), width=15,height=5)
# print() is needed so the lattice object is actually drawn to the device.
print(xyplot(rel_freq ~ abs.dis | pattern,
data = df.plot2,
#groups = pattern,
#auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.1),
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Relative Frequency (GATA3 peaks - DHS regions)",
#main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
# x tick marks every 5 bp from 0 to 50, labels rotated 45 degrees.
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
#par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
# Panel: light grey vertical guides at x = 1..15, then a density overlay of
# the distances, then the relative-frequency points.
panel = function(x,y,...) {panel.abline(v=c(seq(1, 15, 1)), col = "grey90")
# NOTE(review): `data`, `from`, `to` and type = "count" are not obviously
# arguments of panel.densityplot; they are presumably absorbed by `...` --
# confirm they have the intended effect.
panel.densityplot(x, data = df.plot2,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col="pink", ...)
panel.xyplot(x, y,
col="red",
pch=18,
cex=0.6,...)
})
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## quartz_off_screen
## 2
It seems that these peaks show less enrichment of spaced 3mers; however, they could still be enriched for single sites that act as GATA binding elements.
ls closest.1st*.bed
head closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
wc -l closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
wc -l closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.ATC.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.ATC.to.indep.DHS.control.consensus.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed
## closest.1st.plus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.ATC.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.ATC.to.indep.DHS.control.consensus.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
## chr1 827380 827381 chr1 827302 827305 36 36 - ATC 76
## chr1 916769 916770 chr1 916719 916722 36 36 - ATC 48
## chr1 924853 924854 chr1 924740 924743 36 36 - ATC 111
## chr1 966653 966654 chr1 966718 966721 36 36 - ATC 65
## chr1 999508 999509 chr1 999577 999580 36 36 - ATC 69
## chr1 1000536 1000537 chr1 1000627 1000630 36 36 - ATC 91
## chr1 1001891 1001892 chr1 1001937 1001940 36 36 - ATC 46
## chr1 1013265 1013266 chr1 1013247 1013250 36 36 - ATC 16
## chr1 1013580 1013581 chr1 1013584 1013587 36 36 - ATC 4
## chr1 1020187 1020188 chr1 1020330 1020333 36 36 - ATC 143
## 37308 closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## 37308 closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
xy plot
GATA3 peaks without motifs:
# Observed relative frequency of each distinct value in `data`.
#
# Args:
#   data: a vector of observations (e.g. absolute distances).
#
# Returns:
#   A data frame with one row per distinct value: `value` holds the value as
#   text (the names of the frequency table) and `actual_freq` holds its count
#   divided by length(data).
calculate_actual_frequency <- function(data) {
  n_obs <- length(data)
  freq_table <- table(data) / n_obs
  data.frame(
    value = names(freq_table),
    actual_freq = as.vector(freq_table)
  )
}
# Build df.peak.nomotif for the closest-3mer analysis: one row per line of
# every closest.1st.<strand>.<3mer>.to.GATA3_without_motifs_123456_78_161bp_mast.bed
# file, holding the distance (column 11 of the file), its absolute value, the
# "<strand>.<3mer>" label parsed from the file name, and the observed
# frequency of that absolute distance within the file.
# NOTE(review): column 11 is presumably the distance that closestBed -d
# appends as the last column -- confirm against the input files.

# Fallback used only when no file matches the glob. The frame has four
# columns to match the four names (it was previously created with five).
df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.peak.nomotif) <- c("dis", "closest_3mer", "abs.dis", "actual_freq")

peak_closest_files <- Sys.glob(file.path("./closest.1st.*.to.GATA3_without_motifs_123456_78_161bp_mast.bed"))
# Dots are escaped so they only match literal dots; group 1 is the
# "<strand>.<3mer>" label, e.g. "minus.ATC".
peak_closest_regex <- "^closest\\.1st\\.(.*)\\.to\\.GATA3_without_motifs_123456_78_161bp_mast\\.bed$"
# Per-file tables are collected in a list and bound once at the end; calling
# rbind() inside the loop re-copies all accumulated rows on every iteration.
peak_closest_parts <- vector("list", length(peak_closest_files))

for (file_idx in seq_along(peak_closest_files)) {
  closest_1st_dis <- peak_closest_files[file_idx]
  print(closest_1st_dis)
  file_name <- basename(closest_1st_dis)
  if (!grepl(peak_closest_regex, file_name)) {
    stop("Cannot parse the closest 3mer label from file name: ", file_name, call. = FALSE)
  }
  closest_3mer <- sub(peak_closest_regex, "\\1", file_name)
  print(closest_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = '')[, 11]),
    closest_3mer = closest_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  # Frequencies are computed per file, so each strand/3mer label sums to 1.
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  peak_closest_parts[[file_idx]] <- temp1
}

if (length(peak_closest_parts) > 0) {
  df.peak.nomotif <- do.call(rbind, peak_closest_parts)
}
## [1] "./closest.1st.minus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "minus.ATC"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "minus.GAT"
## [1] "./closest.1st.plus.ATC.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "plus.ATC"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "plus.GAT"
str(df.peak.nomotif)
## 'data.frame': 149232 obs. of 4 variables:
## $ abs.dis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ dis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ closest_3mer: chr "minus.ATC" "minus.ATC" "minus.ATC" "minus.ATC" ...
## $ actual_freq : num 0.0386 0.0386 0.0386 0.0386 0.0386 ...
unique(df.peak.nomotif$closest_3mer)
## [1] "minus.ATC" "minus.GAT" "plus.ATC" "plus.GAT"
nrow(df.peak.nomotif)
## [1] 149232
head(df.peak.nomotif)
## abs.dis dis closest_3mer actual_freq
## 1 0 0 minus.ATC 0.03857082
## 2 0 0 minus.ATC 0.03857082
## 3 0 0 minus.ATC 0.03857082
## 4 0 0 minus.ATC 0.03857082
## 5 0 0 minus.ATC 0.03857082
## 6 0 0 minus.ATC 0.03857082
df.peak.nomotif$status="GATA3_peak_without_motifs"
DHS regions (neg ctrl):
# Build df.ctrl for the closest-3mer analysis: same table as for the GATA3
# peaks, but read from the independent DHS control files
#   closest.1st.<strand>.<3mer>.to.indep.DHS.control.consensus.bed
# One row per input line with the distance (column 11), its absolute value,
# the "<strand>.<3mer>" label parsed from the file name, and the observed
# frequency of that absolute distance within the file.
# NOTE(review): column 11 is presumably the distance that closestBed -d
# appends as the last column -- confirm against the input files.

# Fallback used only when no file matches the glob. The frame has four
# columns to match the four names (it was previously created with five).
df.ctrl <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(df.ctrl) <- c("dis", "closest_3mer", "abs.dis", "actual_freq")

ctrl_closest_files <- Sys.glob(file.path("./closest.1st.*.to.indep.DHS.control.consensus.bed"))
# Dots are escaped so they only match literal dots; group 1 is the
# "<strand>.<3mer>" label, e.g. "minus.ATC".
ctrl_closest_regex <- "^closest\\.1st\\.(.*)\\.to\\.indep\\.DHS\\.control\\.consensus\\.bed$"
# Per-file tables are collected in a list and bound once at the end; calling
# rbind() inside the loop re-copies all accumulated rows on every iteration.
ctrl_closest_parts <- vector("list", length(ctrl_closest_files))

for (file_idx in seq_along(ctrl_closest_files)) {
  closest_1st_dis <- ctrl_closest_files[file_idx]
  print(closest_1st_dis)
  file_name <- basename(closest_1st_dis)
  if (!grepl(ctrl_closest_regex, file_name)) {
    stop("Cannot parse the closest 3mer label from file name: ", file_name, call. = FALSE)
  }
  closest_3mer <- sub(ctrl_closest_regex, "\\1", file_name)
  print(closest_3mer)

  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = '')[, 11]),
    closest_3mer = closest_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  # Frequencies are computed per file, so each strand/3mer label sums to 1.
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  ctrl_closest_parts[[file_idx]] <- temp1
}

if (length(ctrl_closest_parts) > 0) {
  df.ctrl <- do.call(rbind, ctrl_closest_parts)
}
## [1] "./closest.1st.minus.ATC.to.indep.DHS.control.consensus.bed"
## [1] "minus.ATC"
## [1] "./closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "minus.GAT"
## [1] "./closest.1st.plus.ATC.to.indep.DHS.control.consensus.bed"
## [1] "plus.ATC"
## [1] "./closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "plus.GAT"
str(df.ctrl)
## 'data.frame': 231624 obs. of 4 variables:
## $ abs.dis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ dis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ closest_3mer: chr "minus.ATC" "minus.ATC" "minus.ATC" "minus.ATC" ...
## $ actual_freq : num 0.0262 0.0262 0.0262 0.0262 0.0262 ...
unique(df.ctrl$closest_3mer)
## [1] "minus.ATC" "minus.GAT" "plus.ATC" "plus.GAT"
nrow(df.ctrl)
## [1] 231624
head(df.ctrl)
## abs.dis dis closest_3mer actual_freq
## 1 0 0 minus.ATC 0.0262149
## 2 0 0 minus.ATC 0.0262149
## 3 0 0 minus.ATC 0.0262149
## 4 0 0 minus.ATC 0.0262149
## 5 0 0 minus.ATC 0.0262149
## 6 0 0 minus.ATC 0.0262149
# Label the control rows, stack them under the GATA3 peak rows, and turn the
# panel and group variables into factors with a fixed display order.
df.ctrl[["status"]] <- "MCF7_DHS_regions"
df.plot <- rbind(df.peak.nomotif, df.ctrl)
df.plot[["closest_3mer"]] <- factor(
  df.plot[["closest_3mer"]],
  levels = c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC")
)
df.plot[["status"]] <- factor(
  df.plot[["status"]],
  levels = c("GATA3_peak_without_motifs", "MCF7_DHS_regions")
)
nrow(df.plot)
## [1] 380856
nrow(df.plot[!duplicated(df.plot), ])
## [1] 8545
summary(df.plot[!duplicated(df.plot), ])
## abs.dis dis closest_3mer actual_freq
## Min. : 0 Min. : 0 plus.GAT :2124 Min. :1.727e-05
## 1st Qu.: 267 1st Qu.: 267 minus.ATC:2141 1st Qu.:1.727e-05
## Median : 1028 Median : 1028 minus.GAT:2151 Median :1.727e-05
## Mean : 28226 Mean : 28226 plus.ATC :2129 Mean :9.362e-04
## 3rd Qu.: 19605 3rd Qu.: 19605 3rd Qu.:1.900e-04
## Max. :890240 Max. :890240 Max. :3.860e-02
## status
## GATA3_peak_without_motifs:2454
## MCF7_DHS_regions :6091
##
##
##
##
library(lattice)
library(latticeExtra)
# Write a 15 x 5 PDF with one panel per strand/3mer label (4 panels in a row)
# showing actual_freq against abs.dis, grouped by status (GATA3 peaks without
# motifs vs MCF7 DHS regions). Axes are limited to x in [0, 50] and
# y in [0, 0.15].
pdf('xy_closest_1st_3mer_to_peaks_without_motifs.pdf', width=15,height=5)
# print() is needed so the lattice object is actually drawn to the device.
print(
xyplot(actual_freq ~ abs.dis | closest_3mer,
# The full table (not the de-duplicated one below) is plotted, so rows that
# share a distance repeat the same point; the density overlay in the panel
# function is computed from these repeated x values.
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.15),
layout=c(4,1),
#type = c('p', 'smooth'),
# NOTE(review): this label is identical to the one in the spaced-3mer plots,
# but abs.dis here is read from the closest.1st.* files -- confirm the label
# still describes the plotted distance.
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Frequency",
#main="Independent DHS Regions",
between=list(y=1.0),
# x tick marks every 5 bp from 0 to 50, labels rotated 45 degrees.
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
# Line colours used for the two status groups in the key (red, black).
par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
# Panel: light grey vertical guides at x = 1..10, then a density overlay of
# the distances, then the frequency points.
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
# NOTE(review): `data`, `from`, `to` and type = "count" are not obviously
# arguments of panel.densityplot; they are presumably absorbed by `...` --
# confirm they have the intended effect.
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","grey"), ...)
panel.xyplot(x, y,
col=c("red","black"),
pch=18,
cex=0.6,...)
})
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## quartz_off_screen
## 2
Again, we can demonstrate the relative enrichment of the closest 3mer by subtracting the actual frequency in the DHS regions from that in the GATA3 peaks at each relative distance.
# One row per distinct (distance, strand/3mer label) combination for the
# GATA3 peaks; only the first four columns are kept.
uniq.df.peak.nomotif <- unique(df.peak.nomotif)[, 1:4]
# Same for the DHS regions, with the frequency column renamed so that both
# frequencies can sit side by side after the merge.
uniq.df.ctrl <- unique(df.ctrl)[, 1:4]
names(uniq.df.ctrl)[4] <- "actual_freq_DHS"
# Relative frequency = GATA3 peak frequency minus DHS frequency.
# all.x = TRUE keeps peak rows without a DHS counterpart; those get NA.
df.plot2 <- merge(
  uniq.df.peak.nomotif,
  uniq.df.ctrl,
  by = c("abs.dis", "dis", "closest_3mer"),
  all.x = TRUE
)
df.plot2[["rel_freq"]] <- ifelse(
  is.na(df.plot2[["actual_freq_DHS"]]),
  NA,
  df.plot2[["actual_freq"]] - df.plot2[["actual_freq_DHS"]]
)
# Panel order matches the previous closest-3mer plot.
df.plot2[["closest_3mer"]] <- factor(
  df.plot2[["closest_3mer"]],
  levels = c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC")
)
library(lattice)
library(latticeExtra)
# Write a 15 x 5 PDF with one panel per strand/3mer label (4 panels in a row)
# showing rel_freq (GATA3 peak frequency minus DHS frequency) against
# abs.dis. Axes are limited to x in [0, 50] and y in [0, 0.1], so negative
# differences fall outside the plotted range.
pdf(paste0('xy_closest_1st_3mer_GATA3_peaks_without_motifs_compare_to_DHS.pdf'), width=15,height=5)
# print() is needed so the lattice object is actually drawn to the device.
print(xyplot(rel_freq ~ abs.dis | closest_3mer,
data = df.plot2,
#groups = pattern,
#auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.1),
layout=c(4,1),
#type = c('p', 'smooth'),
# NOTE(review): this label is identical to the one in the spaced-3mer plots,
# but abs.dis here is read from the closest.1st.* files -- confirm the label
# still describes the plotted distance.
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Relative Frequency (GATA3 peaks - DHS regions)",
#main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
# x tick marks every 5 bp from 0 to 50, labels rotated 45 degrees.
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
#par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
# Panel: light grey vertical guides at x = 0..15, then a density overlay of
# the distances, then the relative-frequency points.
panel = function(x,y,...) {panel.abline(v=c(seq(0, 15, 1)), col = "grey90")
# NOTE(review): `data`, `from`, `to` and type = "count" are not obviously
# arguments of panel.densityplot; they are presumably absorbed by `...` --
# confirm they have the intended effect.
panel.densityplot(x, data = df.plot2,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col="pink", ...)
panel.xyplot(x, y,
col="red",
pch=18,
cex=0.6,...)
})
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## quartz_off_screen
## 2
We can observe the enrichment, but its frequency is low. This could be attributed to low-intensity peaks that are not actually bound by GATA3; they were labeled as GATA3 peaks only because the MACS3 peak-calling software identifies peaks based on the p-value threshold we set. These peaks could dilute the enrichment frequency because they increase the denominator when the frequency is calculated.
When we divide these GATA3 peaks (without motifs) into five quantiles based on their intensity, it becomes evident that peaks with higher intensity also exhibit stronger enrichment in observing GAT/ATC close to the peak summit.
We have divided the GATA3 peaks without motifs into 5 quantiles based on the ranked intensity in January_updates 3.2.2. We have previously made CDF plots to visualize the enrichment.
quantile0.2_summits.bed
quantile0.6_summits.bed
quantile1_summits.bed
quantile0.4_summits.bed
quantile0.8_summits.bed
240218_closestBed3.R:
(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles)
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# closestBed function
#
# Run bedtools closestBed on two BED-like data frames and return its output.
# Both inputs are written to scratch files, coordinate-sorted with
# `sort -k1,1 -k2,2n`, and passed to closestBed as -a and -b.
#
# Args:
#   functionstring: command used to invoke closestBed (path to the binary).
#   bed1: data frame written out as the -a file.
#   bed2: data frame written out as the -b file.
#   opt.string: extra command-line options, inserted verbatim before -a.
#
# Returns:
#   The closestBed output as a data frame whose columns are named
#   c(colnames(bed1), colnames(bed2), "dis").
#   NOTE(review): the trailing "dis" name assumes the options make closestBed
#   append exactly one extra column (e.g. -d) -- confirm for other options.
#
# Side effects:
#   * Sets options(scipen = 99) and leaves it set, as before, because
#     write.table() calls made after this function may rely on it.
#   * Scratch files are created in the working directory under unique names
#     and are removed when the function exits, including on error. Fixed
#     names such as "a.file.bed" would overwrite a user's file of that name
#     or collide with another run in the same directory.
#
# Raises:
#   An error if sort or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  options(scipen = 99) # not use scientific notation when writing out

  scratch <- c(
    a = tempfile("a.file.", tmpdir = getwd(), fileext = ".bed"),
    b = tempfile("b.file.", tmpdir = getwd(), fileext = ".bed"),
    a_sorted = tempfile("a.file.", tmpdir = getwd(), fileext = ".sorted.bed"),
    b_sorted = tempfile("b.file.", tmpdir = getwd(), fileext = ".sorted.bed"),
    out = tempfile("out.file.", tmpdir = getwd(), fileext = ".bed")
  )
  # remove intermediate files, whatever happens below
  on.exit(unlink(scratch), add = TRUE)

  # Echo a shell command, run it, and stop on a non-zero exit status;
  # system() itself only returns the status and never raises an error.
  run_command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (status != 0) {
      stop("Command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to scratch files
  write.table(bed1, file = scratch[["a"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = scratch[["b"]], quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort a and b file by coordinates (closestBed expects sorted input)
  run_command(paste("sort -k1,1 -k2,2n", shQuote(scratch[["a"]]), ">", shQuote(scratch[["a_sorted"]])))
  run_command(paste("sort -k1,1 -k2,2n", shQuote(scratch[["b"]]), ">", shQuote(scratch[["b_sorted"]])))

  # call closestBed on bed1 and bed2
  run_command(paste(
    functionstring, opt.string,
    "-a", shQuote(scratch[["a_sorted"]]),
    "-b", shQuote(scratch[["b_sorted"]]),
    ">", shQuote(scratch[["out"]]),
    sep = " "
  ))

  res <- read.table(scratch[["out"]], header = FALSE, comment.char = '')
  colnames(res) <- c(colnames(bed1), colnames(bed2), 'dis')
  res
}
# Driver: for each prioritized 3mer and each intensity quantile of the GATA3
# peaks without motifs, find the closest plus-strand and the closest
# minus-strand occurrence of the 3mer to every peak summit, and write each
# result to
#   closest.1st.<plus|minus>.<triplet>.to.GATA3_without_motifs_<quantile>.bed
# in the working directory.
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"
#dir3="/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/ENCODE_DHS_GSE29692/"
prioritized_triplets <- c("GAT", "ATC")
library(bigWig)
quantiles <- c("quantile1", "quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")

# Read the one tab-separated BED file matching `glob_pattern`. Stops with a
# clear message when the pattern matches no file or more than one, instead of
# handing read.table() an empty or multi-element `file` argument.
read_single_bed <- function(glob_pattern) {
  matches <- Sys.glob(file.path(glob_pattern))
  if (length(matches) != 1L) {
    stop(
      "Expected exactly one file matching '", glob_pattern, "' but found ",
      length(matches),
      call. = FALSE
    )
  }
  read.table(file = matches, sep = "\t", header = FALSE)
}

for (triplet in prioritized_triplets) {
  print(triplet)
  # 3mer genome coordinates (read once per 3mer, reused for every quantile)
  plus.triplet.file <- read_single_bed(paste0(dir1, "hg38.3.3.3plus.*_", triplet, ".bed"))
  minus.triplet.file <- read_single_bed(paste0(dir1, "hg38.3.3.3minus.*_", triplet, ".bed"))

  for (quantile in quantiles) {
    print(quantile)
    # peak summits
    # NOTE(review): center.bed() is defined outside this chunk; with both
    # windows set to 0 it presumably reduces each interval to its centre
    # position -- confirm.
    GATA3_peak_summits <- center.bed(
      read.table(paste0(dir2, quantile, "_summits.bed"), header = FALSE),
      upstreamWindow = 0,
      downstreamWindow = 0
    )

    # closestBed--1st closest plus
    closest.1st.plus.triplet.to.peak <- bedTools.closest(
      bed1 = GATA3_peak_summits[, 1:3],
      bed2 = plus.triplet.file,
      opt.string = '-d -t first'
    )
    write.table(
      closest.1st.plus.triplet.to.peak,
      file = paste0('closest.1st.plus.', triplet, '.to.GATA3_without_motifs_', quantile, '.bed'),
      quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
    )

    # closestBed--1st closest minus
    closest.1st.minus.triplet.to.peak <- bedTools.closest(
      bed1 = GATA3_peak_summits[, 1:3],
      bed2 = minus.triplet.file,
      opt.string = '-d -t first'
    )
    write.table(
      closest.1st.minus.triplet.to.peak,
      file = paste0('closest.1st.minus.', triplet, '.to.GATA3_without_motifs_', quantile, '.bed'),
      quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
    )
  }
}
runR.sh
#!/bin/bash
# Slurm batch script: loads R 4.1.2 and runs 240218_closestBed3.R.
# Requested resources: 1 node (-N), 1 task (-n), 8 CPUs for that task (-c),
# 200G of memory, on the "general" partition (-p) with the "general" QOS.
# Mail is sent to the address below for all job events (--mail-type=ALL).
# %j in the -o / -e file names is replaced by the job ID, so stdout and stderr
# of each submission go to their own files.
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
# Make the cluster's R 4.1.2 installation available, then run the analysis.
module load R/4.1.2
Rscript 240218_closestBed3.R
bedtools subtract.
#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err
# For each 3-mer, strand and peak quantile, remove the 3-mer instances that
# were already reported as "1st closest" to a peak summit from the genome-wide
# 3-mer coordinates, so that a later closestBed run finds the next-closest one.
#
# input_dir1: genome-wide 3-mer BED files (one per strand and 3-mer).
# input_dir2: closest.1st.<strand>.<3mer>.to.GATA3_without_motifs_<quantile>.bed
#             files; columns 4-10 of those files hold the 3-mer BED record.
# Output (working directory):
#   hg38.3.3.3<strand>.<3mer>_without_1st_<strand>_<3mer>_to_GATA3_without_motifs_<quantile>.bed
input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles/
prioritized_triplets=("GAT" "ATC")
quantiles=("quantile1" "quantile0.8" "quantile0.6" "quantile0.4" "quantile0.2")
strands=("plus" "minus")
module load bedtools
for triplet in "${prioritized_triplets[@]}"
do
    echo "$triplet"
    # Sort the genome-wide coordinates once per 3-mer and strand: the result
    # does not depend on the quantile, so it is hoisted out of the inner loop.
    for strand in "${strands[@]}"
    do
        sort -k1,1 -k2,2n "${input_dir1}"hg38.3.3.3"${strand}"*"${triplet}".bed > "hg38.3.3.3${strand}.${triplet}.sorted.bed"
    done
    for quantile in "${quantiles[@]}"
    do
        echo "$quantile"
        for strand in "${strands[@]}"
        do
            genome_sorted="hg38.3.3.3${strand}.${triplet}.sorted.bed"
            closest_in="${input_dir2}closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_${quantile}.bed"
            closest_sorted="closest.1st.${strand}.${triplet}.to.GATA3_without_motifs_${quantile}.uniq.sorted.bed"
            # Keep the 3-mer record (columns 4-10) as tab-separated BED.
            # Sort BEFORE uniq: uniq only collapses adjacent duplicate lines.
            awk 'BEGIN {OFS="\t"} {print $4, $5, $6, $7, $8, $9, $10}' "$closest_in" \
                | sort -k1,1 -k2,2n | uniq > "$closest_sorted"
            # -f 1.00: require full overlap of the -a interval; -s: same strand only.
            bedtools subtract -a "$genome_sorted" -b "$closest_sorted" -f 1.00 -s \
                > "hg38.3.3.3${strand}.${triplet}_without_1st_${strand}_${triplet}_to_GATA3_without_motifs_${quantile}.bed"
            rm "$closest_sorted"
        done
    done
    # Remove the per-3-mer sorted intermediates.
    for strand in "${strands[@]}"
    do
        rm "hg38.3.3.3${strand}.${triplet}.sorted.bed"
    done
done
240218_closestBed4.R
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# Run bedtools closestBed on two BED-like data frames and return the result.
#
# Arguments:
#   functionstring  command used to invoke closestBed (inserted verbatim).
#   bed1, bed2      data frames written as the -a and -b inputs; both are
#                   sorted by column 1, then numerically by column 2.
#   opt.string      extra closestBed options, inserted verbatim. The column
#                   naming below expects exactly one extra output column, as
#                   produced by '-d'.
# Returns: a data frame whose columns are named after bed1, then bed2, then
#   'dis'.
# Side effects: sets options(scipen = 99) and deliberately leaves it set, so
#   that callers' subsequent write.table() calls also avoid scientific
#   notation. Each command is echoed with cat() before it runs.
# Errors: stops if sort or closestBed exits with a non-zero status.
bedTools.closest <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  options(scipen = 99) # not use scientific notation when writing out

  # Private scratch files: fixed names in the working directory would collide
  # between concurrent jobs; on.exit() removes them even if a step fails.
  a_file <- tempfile("a.file.", fileext = ".bed")
  b_file <- tempfile("b.file.", fileext = ".bed")
  a_sorted <- tempfile("a.file.sorted.", fileext = ".bed")
  b_sorted <- tempfile("b.file.sorted.", fileext = ".bed")
  out_file <- tempfile("out.file.", fileext = ".bed")
  on.exit(unlink(c(a_file, b_file, a_sorted, b_sorted, out_file)), add = TRUE)

  # Echo a shell command, run it, and fail loudly on a non-zero exit status.
  run_command <- function(command) {
    cat(command, "\n")
    status <- system(command)
    if (!identical(as.integer(status), 0L)) {
      stop("command failed with exit status ", status, ": ", command, call. = FALSE)
    }
    invisible(status)
  }

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_file, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)

  # sort both inputs by coordinates
  run_command(paste("sort -k1,1 -k2,2n", shQuote(a_file), ">", shQuote(a_sorted)))
  run_command(paste("sort -k1,1 -k2,2n", shQuote(b_file), ">", shQuote(b_sorted)))

  # call closestBed on the sorted inputs
  run_command(paste(functionstring, opt.string,
                    "-a", shQuote(a_sorted), "-b", shQuote(b_sorted),
                    ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  res
}
#library(lattice)
#library(latticeExtra)
#library(Biostrings)
library(bigWig)
# For every peak quantile and every ordered (anchor, query) 3-mer pair, find
# the query 3-mer closest to each anchor 3-mer (the anchor being the 3-mer
# previously found closest to a peak summit) and write the closestBed result.

# List of prioritized triplets
prioritized_triplets <- c("GAT", "ATC")
# All ordered pairs of triplets. In expand.grid() the first column varies
# fastest, which gives the same row order as the nested i/j loop:
# GAT-GAT, GAT-ATC, ATC-GAT, ATC-ATC.
triplet_pairs <- expand.grid(Last_3_bases = prioritized_triplets,
                             First_3_bases = prioritized_triplets,
                             stringsAsFactors = FALSE)
first_3_bases <- triplet_pairs$First_3_bases
last_3_bases <- triplet_pairs$Last_3_bases
# List of the corresponding 6mers (first triplet followed by last triplet)
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))
# Create data frame with first 3 bases and last 3 bases
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)
# nested loop
dir <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/GAT_ATC_240218/quantiles/"
quantiles <- c("quantile1", "quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")
# Per-pattern window sizes; the columns used below are pattern, plus_upstream
# and plus_downstream.
win <- read.csv(paste0('/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/closest_other_3mer/closest_2nd_other_3mer/pattern_anchor_at_GorC_for_bigWig_pkg.csv'))
# GATA3 peaks without motifs
for (quantile in quantiles) {
  print(quantile)
  # seq_len() is safe when df has zero rows (1:nrow(df) would yield c(1, 0))
  for (i in seq_len(nrow(df))) {
    pattern1 <- df[i, 1]
    pattern2 <- df[i, 2]
    # anchor position: closest +/- pattern1 relative to G/C
    print(pattern1)
    # columns 4:11 of the closest.1st file = the 3-mer record plus distance
    anchor_file <- paste0(dir, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_", quantile, ".bed")
    closest_plus_3mer_to_GATA3_peak_summits <- fiveprime.bed(
      read.table(anchor_file, header = FALSE)[, 4:11],
      upstreamWindow = win[win$pattern == pattern1, "plus_upstream"],
      downstreamWindow = win[win$pattern == pattern1, "plus_downstream"]
    )
    # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates) - relative to G/C
    print(pattern2)
    query_file <- paste0(dir, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_", quantile, ".bed")
    plus.3mer.file <- fiveprime.bed(
      read.table(file = query_file, sep = "\t", header = FALSE),
      upstreamWindow = win[win$pattern == pattern2, "plus_upstream"],
      downstreamWindow = win[win$pattern == pattern2, "plus_downstream"]
    )
    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif <- bedTools.closest(
      bed1 = closest_plus_3mer_to_GATA3_peak_summits[, 1:3],
      bed2 = plus.3mer.file,
      opt.string = "-d -t first"
    )
    write.table(closest.2nd.plus.3mer.to.1st.plus.3mer.GATA3.without.motif,
                file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".GATA3.without.motifs.", quantile, ".bed"),
                quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
  }
}
runR.sh
#!/bin/bash
# Slurm batch script: loads the R 4.1.2 module and runs 240218_closestBed4.R
# with Rscript.
# Resources requested below: 1 node (-N), 1 task (-n), 16 CPUs per task (-c),
# 200G of memory, on the "general" partition and QOS.
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
# E-mail notifications for all job events.
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
# stdout / stderr files; %j is replaced by the job ID.
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
module load R/4.1.2
Rscript 240218_closestBed4.R
CDF plots already made.
xy plots
GATA3 peaks without motifs in quantile 1:
# Relative frequency of each distinct value in a vector.
#
# data: a vector of observations.
# Returns a data frame with one row per distinct non-NA value:
#   value        the value as a character string (taken from names(table()))
#   actual_freq  count of that value divided by length(data); NA entries are
#                not tabulated but still count towards length(data)
# For input with no tabulated values (empty or all NA) a zero-row data frame
# with both columns is returned, so merging on "value" keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  if (length(counts) == 0L) {
    return(data.frame(value = character(0), actual_freq = numeric(0),
                      stringsAsFactors = FALSE))
  }
  data.frame(value = names(counts),
             actual_freq = as.vector(counts) / length(data),
             stringsAsFactors = FALSE)
}
# Load the distance column (column 11) of every
# closest.2nd.plus.<query>.to.1st.plus.<anchor>.GATA3.without.motifs.quantile1.bed
# file in the working directory, tag each row with the anchor and query 3-mer
# parsed from the file name, and attach the per-file frequency of each
# absolute distance.
closest_2nd_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.GATA3.without.motifs.quantile1.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
per_file_tables <- vector("list", length(closest_2nd_files))
for (file_index in seq_along(closest_2nd_files)) {
  closest_2nd_dis <- closest_2nd_files[file_index]
  print(closest_2nd_dis)
  bed_name <- basename(closest_2nd_dis)
  anchor_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.*.to.1st.plus.')[[1]][2], ".GATA3.without.motifs.quantile1.bed")[[1]][1]
  print(anchor_3mer)
  query_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.')[[1]][2], ".to.1st.plus.*.GATA3.without.motifs.quantile1.bed")[[1]][1]
  print(query_3mer)
  # Build the table directly; no round trip through a character matrix.
  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  per_file_tables[[file_index]] <- temp1
}
if (length(per_file_tables) > 0L) {
  df.peak.nomotif <- do.call(rbind, per_file_tables)
} else {
  # No matching files: keep the empty five-column template.
  df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 5))
  colnames(df.peak.nomotif) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
}
str(df.peak.nomotif)
unique(df.peak.nomotif$anchor_3mer)
unique(df.peak.nomotif$query_3mer)
nrow(df.peak.nomotif)
head(df.peak.nomotif)
df.peak.nomotif$status <- "GATA3_peak_without_motifs_quantile1"
DHS regions (neg ctrl):
# Load the same distance tables for the DHS control regions
# (closest.2nd.plus.<query>.to.1st.plus.<anchor>.indep.DHS.control.bed), then
# stack them under the peak table to build the plotting data frame.
ctrl_files <- Sys.glob(file.path("./closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
ctrl_tables <- vector("list", length(ctrl_files))
for (file_index in seq_along(ctrl_files)) {
  closest_2nd_dis <- ctrl_files[file_index]
  print(closest_2nd_dis)
  bed_name <- basename(closest_2nd_dis)
  anchor_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.*.to.1st.plus.')[[1]][2], ".indep.DHS.control.bed")[[1]][1]
  print(anchor_3mer)
  query_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.')[[1]][2], ".to.1st.plus.*.indep.DHS.control.bed")[[1]][1]
  print(query_3mer)
  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  ctrl_tables[[file_index]] <- temp1
}
if (length(ctrl_tables) > 0L) {
  df.ctrl <- do.call(rbind, ctrl_tables)
} else {
  # No matching files: keep the empty five-column template.
  df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
  colnames(df.ctrl) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
}
str(df.ctrl)
unique(df.ctrl$anchor_3mer)
unique(df.ctrl$query_3mer)
nrow(df.ctrl)
head(df.ctrl)
df.ctrl$status <- "MCF7_DHS_regions"
# Combined table: pattern is "<anchor>-<query>", with fixed panel/legend order.
df.plot <- rbind(df.peak.nomotif, df.ctrl)
df.plot$pattern <- paste0(df.plot$anchor_3mer, "-", df.plot$query_3mer)
df.plot$pattern <- factor(df.plot$pattern, levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT"))
df.plot$status <- factor(df.plot$status, levels = c("GATA3_peak_without_motifs_quantile1", "MCF7_DHS_regions"))
nrow(df.plot)
nrow(df.plot[!duplicated(df.plot), ])
summary(df.plot[!duplicated(df.plot), ])
library(lattice)
library(latticeExtra)
# Plot frequency vs. absolute distance, one panel per anchor-query pattern,
# with the two status groups overlaid, and save it as a 15 x 5 inch PDF.
pdf('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_quantile1.pdf', width=15,height=5)
print(
xyplot(actual_freq ~ abs.dis | pattern,
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
# axes restricted to distances 0-50 and frequencies 0-0.15
xlim=c(0,50),
ylim=c(0, 0.15),
# four panels in one row
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Frequency",
#main="Independent DHS Regions",
between=list(y=1.0),
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
# custom panel: light vertical guides at x = 1..10, then a density curve of
# the distances, then the frequency points on top
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
# NOTE(review): confirm the intended effect of data = and type = "count" in
# panel.densityplot; the density is estimated from x between 0 and 50.
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","grey"), ...)
panel.xyplot(x, y,
col=c("red","black"),
pch=18,
cex=0.6,...)
})
)
dev.off()
Relative enrichment, calculated at each relative distance by subtracting the actual frequency in the DHS regions from the actual frequency in the GATA3 peaks.
# Build the relative-frequency table: de-duplicate both tables, join them on
# distance and 3-mer pair, and take the difference of the two frequencies
# (GATA3 peaks minus DHS regions).
# GATA3 peaks: distinct rows, first five columns only
peak_is_first <- !duplicated(df.peak.nomotif)
uniq.df.peak.nomotif <- df.peak.nomotif[peak_is_first, 1:5]
# DHS regions: same, with the frequency column renamed for the join
ctrl_is_first <- !duplicated(df.ctrl)
uniq.df.ctrl <- df.ctrl[ctrl_is_first, 1:5]
names(uniq.df.ctrl)[5] <- "actual_freq_DHS"
# left join keeps every peak row; rows without a DHS match get NA
join_keys <- c("abs.dis", "dis", "anchor_3mer", "query_3mer")
df.plot2 <- merge(uniq.df.peak.nomotif, uniq.df.ctrl, by = join_keys, all.x = TRUE)
# relative frequency = GATA3 peak frequency - DHS frequency (NA if no DHS value)
df.plot2$rel_freq <- ifelse(is.na(df.plot2$actual_freq_DHS), NA, df.plot2$actual_freq - df.plot2$actual_freq_DHS)
# "<anchor>-<query>" label with a fixed panel order
df.plot2$pattern <- factor(
  paste0(df.plot2$anchor_3mer, "-", df.plot2$query_3mer),
  levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT")
)
library(lattice)
library(latticeExtra)
# Plot the relative frequency (rel_freq) against absolute distance, one panel
# per anchor-query pattern, and save it as a 15 x 5 inch PDF.
pdf(paste0('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_quantile1_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis | pattern,
data = df.plot2,
#groups = pattern,
#auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
# NOTE(review): the y axis starts at 0, so any negative rel_freq values are
# outside the plotted range -- confirm this is intended.
ylim=c(0, 0.1),
# four panels in one row
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Relative Frequency (GATA3 peaks - DHS regions)",
#main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
#par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
# custom panel: vertical guides at x = 1..15, a density curve of the
# distances, the relative-frequency points, and a red reference line at 0.04
panel = function(x,y,...) {panel.abline(v=c(seq(1, 15, 1)), col = "grey90")
# NOTE(review): confirm the intended effect of data = and type = "count" in
# panel.densityplot.
panel.densityplot(x, data = df.plot2,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col="pink", ...)
panel.xyplot(x, y,
col="red",
pch=18,
cex=0.6,...)
panel.abline(h=0.04, col = "red")
})
)
dev.off()
GATA3 peaks without motifs in other quantile:
DHS regions (neg ctrl):
# Relative frequency of each distinct value in a vector.
#
# data: a vector of observations.
# Returns a data frame with one row per distinct non-NA value:
#   value        the value as a character string (taken from names(table()))
#   actual_freq  count of that value divided by length(data); NA entries are
#                not tabulated but still count towards length(data)
# For input with no tabulated values (empty or all NA) a zero-row data frame
# with both columns is returned, so merging on "value" keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  if (length(counts) == 0L) {
    return(data.frame(value = character(0), actual_freq = numeric(0),
                      stringsAsFactors = FALSE))
  }
  data.frame(value = names(counts),
             actual_freq = as.vector(counts) / length(data),
             stringsAsFactors = FALSE)
}
# Load the DHS control distance tables from the parent directory
# (../closest.2nd.plus.<query>.to.1st.plus.<anchor>.indep.DHS.control.bed):
# column 11 is the distance; anchor and query 3-mers come from the file name;
# each row gets the per-file frequency of its absolute distance.
ctrl_files <- Sys.glob(file.path("../closest.2nd.plus*.to.1st.plus.*.indep.DHS.control.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
ctrl_tables <- vector("list", length(ctrl_files))
for (file_index in seq_along(ctrl_files)) {
  closest_2nd_dis <- ctrl_files[file_index]
  print(closest_2nd_dis)
  bed_name <- basename(closest_2nd_dis)
  anchor_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.*.to.1st.plus.')[[1]][2], ".indep.DHS.control.bed")[[1]][1]
  print(anchor_3mer)
  query_3mer <- strsplit(strsplit(bed_name, 'closest.2nd.plus.')[[1]][2], ".to.1st.plus.*.indep.DHS.control.bed")[[1]][1]
  print(query_3mer)
  temp <- data.frame(
    dis = as.integer(read.table(closest_2nd_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    query_3mer = query_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  ctrl_tables[[file_index]] <- temp1
}
if (length(ctrl_tables) > 0L) {
  df.ctrl <- do.call(rbind, ctrl_tables)
} else {
  # No matching files: keep the empty five-column template.
  df.ctrl <- data.frame(matrix(nrow = 0, ncol = 5))
  colnames(df.ctrl) <- c("dis", "anchor_3mer", "query_3mer", "abs.dis", "actual_freq")
}
str(df.ctrl)
unique(df.ctrl$anchor_3mer)
unique(df.ctrl$query_3mer)
nrow(df.ctrl)
head(df.ctrl)
df.ctrl$status <- "MCF7_DHS_regions"
# For each remaining quantile: load the 2nd-closest-3mer distance tables for
# that quantile, combine them with the DHS control table (df.ctrl), and write
# one PDF of frequency vs. absolute distance per quantile.
quantiles=c("quantile0.8", "quantile0.6", "quantile0.4", "quantile0.2")
for (quantile in quantiles){
print(quantile)
# empty accumulator; per-file tables are appended with rbind below
df.peak.nomotif = data.frame(matrix(nrow = 0, ncol = 5))
colnames(df.peak.nomotif) = c("dis","anchor_3mer", "query_3mer","abs.dis", "actual_freq")
for (closest_2nd_dis in Sys.glob(file.path(paste0("./closest.2nd.plus*.to.1st.plus.*.GATA3.without.motifs.", quantile, ".bed")))) {
print(closest_2nd_dis)
# anchor 3-mer: the part of the file name between ".to.1st.plus." and
# ".GATA3.without.motifs.<quantile>.bed"
anchor_3mer =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.plus.*.to.1st.plus.')[[1]][2]), paste0(".GATA3.without.motifs.",quantile,".bed"))[[1]][1]
print(anchor_3mer)
# query 3-mer: the part of the file name between "closest.2nd.plus." and
# ".to.1st.plus.<anchor>..."
query_3mer =strsplit((strsplit(strsplit(closest_2nd_dis, "/")[[1]][length(strsplit(closest_2nd_dis, "/")[[1]])], 'closest.2nd.plus.')[[1]][2]), paste0(".to.1st.plus.", anchor_3mer, ".GATA3.without.motifs.",quantile,".bed"))[[1]][1]
print(query_3mer)
# column 11 of the file is the distance; cbind() makes everything character,
# so dis is converted back to integer below
temp = as.data.frame(cbind(read.table(closest_2nd_dis, header=F, comment.char='')[,11], anchor_3mer, query_3mer))
colnames(temp) = c("dis", "anchor_3mer", "query_3mer")
temp$dis=as.integer(temp$dis)
temp$abs.dis=abs(temp$dis)
# per-file frequency of each absolute distance, joined back onto the rows
actual_frequencies = calculate_actual_frequency(temp$abs.dis)
temp1= merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
df.peak.nomotif = rbind(df.peak.nomotif,temp1)
}
df.peak.nomotif$status=paste0("GATA3_peak_without_motifs_", quantile)
# stack peaks and controls; fix the panel order and the legend order
df.plot=rbind(df.peak.nomotif, df.ctrl)
df.plot$pattern = paste0(df.plot$anchor_3mer, "-", df.plot$query_3mer)
df.plot$pattern = factor(df.plot$pattern, levels = c("ATC-ATC", "GAT-ATC", "ATC-GAT", "GAT-GAT"))
df.plot$status = factor(df.plot$status, levels = c(paste0("GATA3_peak_without_motifs_", quantile), "MCF7_DHS_regions"))
library(lattice)
library(latticeExtra)
pdf(paste0('xy_closest_2nd_3mer_to_closest_1st_3mer_to_GATA3_peaks_without_motifs_', quantile, '.pdf'), width=15,height=5)
print(
xyplot(actual_freq ~ abs.dis | pattern,
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
ylim=c(0, 0.15),
# four panels in one row
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Frequency",
main=paste0("GATA3 peak without motifs ", quantile),
between=list(y=1.0),
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
# custom panel: vertical guides at x = 1..10, a density curve of the
# distances, then the frequency points on top
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
# NOTE(review): confirm the intended effect of data = and type = "count" in
# panel.densityplot.
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","grey"), ...)
panel.xyplot(x, y,
col=c("red","black"),
pch=18,
cex=0.6,...)
})
)
dev.off()
}
xy plot
GATA3 peaks without motifs:
# Relative frequency of each distinct value in a vector.
#
# data: a vector of observations.
# Returns a data frame with one row per distinct non-NA value:
#   value        the value as a character string (taken from names(table()))
#   actual_freq  count of that value divided by length(data); NA entries are
#                not tabulated but still count towards length(data)
# For input with no tabulated values (empty or all NA) a zero-row data frame
# with both columns is returned, so merging on "value" keeps working.
calculate_actual_frequency <- function(data) {
  counts <- table(data)
  if (length(counts) == 0L) {
    return(data.frame(value = character(0), actual_freq = numeric(0),
                      stringsAsFactors = FALSE))
  }
  data.frame(value = names(counts),
             actual_freq = as.vector(counts) / length(data),
             stringsAsFactors = FALSE)
}
# Load the distance column (column 11) of every
# closest.1st.<strand>.<3mer>.to.GATA3_without_motifs_quantile1.bed file in the
# working directory, tag each row with the "<strand>.<3mer>" label parsed from
# the file name, and attach the per-file frequency of each absolute distance.
closest_1st_files <- Sys.glob(file.path("./closest.1st.*.to.GATA3_without_motifs_quantile1.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
per_file_tables <- vector("list", length(closest_1st_files))
for (file_index in seq_along(closest_1st_files)) {
  closest_1st_dis <- closest_1st_files[file_index]
  print(closest_1st_dis)
  closest_3mer <- strsplit(strsplit(basename(closest_1st_dis), 'closest.1st.')[[1]][2], ".to.GATA3_without_motifs_quantile1.bed")[[1]][1]
  print(closest_3mer)
  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    closest_3mer = closest_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  per_file_tables[[file_index]] <- temp1
}
if (length(per_file_tables) > 0L) {
  df.peak.nomotif <- do.call(rbind, per_file_tables)
} else {
  # No matching files: empty template with one column per name (the table has
  # four columns here, not five).
  df.peak.nomotif <- data.frame(matrix(nrow = 0, ncol = 4))
  colnames(df.peak.nomotif) <- c("dis", "closest_3mer", "abs.dis", "actual_freq")
}
str(df.peak.nomotif)
unique(df.peak.nomotif$closest_3mer)
nrow(df.peak.nomotif)
head(df.peak.nomotif)
df.peak.nomotif$status <- "GATA3_peak_without_motifs_quantile1"
DHS regions (neg ctrl):
# Load the same closest-1st distance tables for the DHS control regions
# (closest.1st.<strand>.<3mer>.to.indep.DHS.control.consensus.bed), then stack
# them under the peak table to build the plotting data frame.
ctrl_files <- Sys.glob(file.path("./closest.1st.*.to.indep.DHS.control.consensus.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
ctrl_tables <- vector("list", length(ctrl_files))
for (file_index in seq_along(ctrl_files)) {
  closest_1st_dis <- ctrl_files[file_index]
  print(closest_1st_dis)
  closest_3mer <- strsplit(strsplit(basename(closest_1st_dis), 'closest.1st.')[[1]][2], ".to.indep.DHS.control.consensus.bed")[[1]][1]
  print(closest_3mer)
  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    closest_3mer = closest_3mer,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  actual_frequencies <- calculate_actual_frequency(temp$abs.dis)
  temp1 <- merge(temp, actual_frequencies, by.x = "abs.dis", by.y = "value", all.x = TRUE)
  ctrl_tables[[file_index]] <- temp1
}
if (length(ctrl_tables) > 0L) {
  df.ctrl <- do.call(rbind, ctrl_tables)
} else {
  # No matching files: empty template with one column per name.
  df.ctrl <- data.frame(matrix(nrow = 0, ncol = 4))
  colnames(df.ctrl) <- c("dis", "closest_3mer", "abs.dis", "actual_freq")
}
str(df.ctrl)
unique(df.ctrl$closest_3mer)
nrow(df.ctrl)
head(df.ctrl)
df.ctrl$status <- "MCF7_DHS_regions"
# Combined table with a fixed panel order and legend order.
df.plot <- rbind(df.peak.nomotif, df.ctrl)
df.plot$closest_3mer <- factor(df.plot$closest_3mer, levels = c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC"))
df.plot$status <- factor(df.plot$status, levels = c("GATA3_peak_without_motifs_quantile1", "MCF7_DHS_regions"))
nrow(df.plot)
nrow(df.plot[!duplicated(df.plot), ])
summary(df.plot[!duplicated(df.plot), ])
library(lattice)
library(latticeExtra)
# Plot frequency vs. absolute distance, one panel per closest 3-mer
# (strand.3mer), with the two status groups overlaid, and save it as a
# 15 x 5 inch PDF.
pdf('xy_closest_1st_3mer_to_peaks_without_motifs_quantile1.pdf', width=15,height=5)
print(
xyplot(actual_freq ~ abs.dis | closest_3mer,
#data = df.plot[!duplicated(df.plot), ],
data = df.plot,
groups = status,
auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
# axes restricted to distances 0-50 and frequencies 0-0.15
xlim=c(0,50),
ylim=c(0, 0.15),
# four panels in one row
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Frequency",
#main="Independent DHS Regions",
between=list(y=1.0),
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
par.settings = list(superpose.line = list(col=c("red", "black"), lwd=2), strip.background=list(col="grey85")),
# custom panel: vertical guides at x = 1..10, a density curve of the
# distances, then the frequency points on top
panel = function(x,y,...) {panel.abline(v=c(seq(1, 10, 1)), col = "grey90")
# NOTE(review): confirm the intended effect of data = and type = "count" in
# panel.densityplot.
panel.densityplot(x, data = df.plot,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col=c("pink","grey"), ...)
panel.xyplot(x, y,
col=c("red","black"),
pch=18,
cex=0.6,...)
})
)
dev.off()
Again, we can demonstrate the relative enrichment of the closest 3mer by subtracting, at each relative distance, the actual frequency in the DHS regions from the actual frequency in the GATA3 peaks.
# Build the relative-frequency table for the closest 3-mer: de-duplicate both
# tables, join them on distance and 3-mer label, and take the difference of
# the two frequencies (GATA3 peaks minus DHS regions).
# GATA3 peaks: distinct rows, first four columns only
peak_is_first <- !duplicated(df.peak.nomotif)
uniq.df.peak.nomotif <- df.peak.nomotif[peak_is_first, 1:4]
# DHS regions: same, with the frequency column renamed for the join
ctrl_is_first <- !duplicated(df.ctrl)
uniq.df.ctrl <- df.ctrl[ctrl_is_first, 1:4]
names(uniq.df.ctrl)[4] <- "actual_freq_DHS"
# left join keeps every peak row; rows without a DHS match get NA
join_keys <- c("abs.dis", "dis", "closest_3mer")
df.plot2 <- merge(uniq.df.peak.nomotif, uniq.df.ctrl, by = join_keys, all.x = TRUE)
# relative frequency = GATA3 peak frequency - DHS frequency (NA if no DHS value)
df.plot2$rel_freq <- ifelse(is.na(df.plot2$actual_freq_DHS), NA, df.plot2$actual_freq - df.plot2$actual_freq_DHS)
# fixed panel order
panel_order <- c("plus.GAT", "minus.ATC", "minus.GAT", "plus.ATC")
df.plot2$closest_3mer <- factor(df.plot2$closest_3mer, levels = panel_order)
library(lattice)
library(latticeExtra)
# Plot the relative frequency (rel_freq) against absolute distance, one panel
# per closest 3-mer, and save it as a 15 x 5 inch PDF.
pdf(paste0('xy_closest_1st_3mer_GATA3_peaks_without_motifs_quantile1_compare_to_DHS.pdf'), width=15,height=5)
print(xyplot(rel_freq ~ abs.dis | closest_3mer,
data = df.plot2,
#groups = pattern,
#auto.key = list(space = "right", lines=TRUE, points=FALSE, cex = 1),
aspect = 1,
xlim=c(0,50),
# NOTE(review): the y axis starts at 0, so any negative rel_freq values are
# outside the plotted range -- confirm this is intended.
ylim=c(0, 0.1),
# four panels in one row
layout=c(4,1),
#type = c('p', 'smooth'),
xlab = "relative distance (bp) between zinc finger (anchor at G/C)",
ylab="Relative Frequency (GATA3 peaks - DHS regions)",
#main=paste0("GATA3 peak with ", motif),
between=list(y=1.0),
scales = list(x = list(relation="free", rot = 45, at = seq(from = 0, to = 50, by = 5))),
#par.settings = list(superpose.line = list(col=c("pink", "skyblue"), lwd=2), strip.background=list(col="grey85")),
# custom panel: vertical guides at x = 0..15, a density curve of the
# distances, then the relative-frequency points on top
panel = function(x,y,...) {panel.abline(v=c(seq(0, 15, 1)), col = "grey90")
# NOTE(review): confirm the intended effect of data = and type = "count" in
# panel.densityplot.
panel.densityplot(x, data = df.plot2,
from=0,
to=50,
lty = c(1),
lwd=2,
darg=list(bw = "nrd0", kernel="gaussian"),
type = "count",
col="pink", ...)
panel.xyplot(x, y,
col="red",
pch=18,
cex=0.6,...)
})
)
dev.off()
Input fasta: GATA3 peaks without motifs 12345678:
# Extract the FASTA sequences of the peak windows listed in
# without_motifs_123456_78_161bp_mast.bed from the hg38 genome.
# NOTE(review): the parentheses run this cd in a subshell, so the working
# directory of the commands below is NOT changed; confirm whether this line is
# only a note of where the commands were run.
(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/RSAT/peak_161_without_motifs_12345678)
module load bedtools
genome=/home/FCAM/ssun/Genome/hg38.fa
dir=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/
# -fi: genome FASTA, -bed: regions to extract, -fo: output FASTA (written to
# the current directory)
fastaFromBed -fi $genome -bed ${dir}without_motifs_123456_78_161bp_mast.bed -fo without_motifs_123456_78_161bp_mast.fasta
Use the previously made background file.
# Activate the conda environment that provides rsat, then run dyad-analysis on
# the FASTA of GATA3 peaks without motifs, using the previously made ENCODE
# MCF7 DHS background file for the expected frequencies.
source ~/miniconda3/bin/activate
conda activate rsat
rsat dyad-analysis -o GATA3_peaks_without_motifs_123456_78_161win_mast_RSAT_dyad.txt -i without_motifs_123456_78_161bp_mast.fasta -format FastA -l 3 -sp 0-20 -expfreq ../ENCODE.MCF7.DHS.background4.txt -return exp_occ,occ,ratio -sort -seqtype dna
# Notes on dyad-analysis options (not all are used in the command above):
# -1str  single-strand count: only the direct strand is considered for
#        oligonucleotide and dyad occurrence counting.
# -2str  count on both strands: the occurrences of each oligonucleotide are
#        summed on both strands. This allows detection of elements that act in
#        an orientation-insensitive way (as is generally the case for yeast
#        upstream elements).
# -type dyad_type (dr|ir|any); any is the default.
#        To speed up execution, the program can be asked to restrict its
#        analysis to symmetric dyads. The accepted types are:
#        dr   direct repeats: the second element is the same as the first one
#        ir   inverted repeats: the second element is the reverse complement
#             of the first one
#        rep  repeats: direct and inverted repeats are evaluated
#        any  (default) the analysis is performed on all non-symmetric dyads
#             as well
View the top patterns ranked by obs/exp ratio:
# Show the ten dyads with the highest value in column 8 (numeric, descending).
sort -k8,8nr GATA3_peaks_without_motifs_123456_78_161win_mast_RSAT_dyad.txt | head
## agan{0}taa agan{0}taa|ttan{0}tct 0.0000096090146 1258 55.24 0 1258 22.77
## agan{0}tta agan{0}tta|taan{0}tct 0.0001052931460 2453 605.36 22 2475 4.05
## cttn{0}atc cttn{0}atc|gatn{0}aag 0.0000784961757 1618 451.29 1 1619 3.59
## gatn{0}aac gatn{0}aac|gttn{0}atc 0.0000457443231 894 263.00 0 894 3.40
## atcn{9}atc atcn{9}atc|gatn{9}gat 0.0002062495205 3746 1105.62 298 4044 3.39
## ccgn{0}ccg ccgn{0}ccg|cggn{0}cgg 0.0003669019521 6932 2109.41 1088 8020 3.29
## gatn{10}ata gatn{10}ata|tatn{10}atc 0.0001643987710 2878 874.66 142 3020 3.29
## gatn{11}ata gatn{11}ata|tatn{11}atc 0.0001563266321 2714 825.43 78 2792 3.29
## attn{0}atc attn{0}atc|gatn{0}aat 0.0000619849111 1141 356.37 0 1141 3.20
## atcn{10}atc atcn{10}atc|gatn{10}gat 0.0002081405868 3470 1107.39 178 3648 3.13
The top significant pattern is a single site AGATAA.
In GATA3 peaks exhibiting motif 1 enrichment, we anticipate pinpointing peaks with a GAT sequence proximal to their summit, given their characterization as enriched with a GATA3-like motif 1. However, upon sorting the closest GAT sequences to the summit within the “.1st.plus.GAT.to.GATA3.with.motif_1.bed” file by their distances, we observed significant deviations in certain instances.
For instance, consider GATA_ChIP_peak_24656, which was identified as enriched within a 101bp window containing motif 1. Surprisingly, the closest GAT sequence is positioned 581672bp away. Upon visual inspection using the UCSC genome browser, we observed a GAT sequence located 23bp downstream of the peak summit, followed by an ATC sequence at a spacing of 3bp, consistent with the expected pattern for motif 1-enriched peaks.
The discrepancy in identifying these GAT coordinates by Seqoutbias, which utilizes a read size of 1000, suggests a potential limitation in its approach to searching within the given genome read size. Further investigation is warranted to understand the underlying reasons for this inconsistency.
UCSC genome browser section: https://genome.ucsc.edu/s/ssun/3mer_coordinates_fail
This could be due to the mappability of the sequence itself, in which case seqOutBias does not generate coordinates for that sequence.
Check whether this region falls into highly repetitive elements (RepeatMasker).
Using BLAT, this sequence was found to map to two regions of the genome. Since the sequence is not uniquely mappable, no 3mer coordinates are generated for it.
Quantify how many peaks contain their closest GAT at distance >200bp from peak summit.
#abs.dis=200
# For each peak set, count the rows whose distance (column 11) lies more than
# 200 bp from the summit. One count is printed per file, in the order below.
# "count + 0" prints 0 instead of an empty line when no row matches.
for peak_set in \
    GATA3.with.motif_1 \
    GATA3.with.motif_2 \
    GATA3.with.motif_4 \
    GATA3.with.motif_5 \
    GATA3.with.motif_6 \
    GATA3_without_motifs_123456_78_161bp_mast \
    GATA3_without_motifs_quantile1 \
    indep.DHS.control.consensus
do
    awk '$11 > 200 || $11 < -200 {count++} END {print count + 0}' "closest.1st.plus.GAT.to.${peak_set}.bed"
done
## 67
## 53
## 90
## 19
## 50
## 1664
## 120
## 2829
# Total number of rows in each closest-GAT file, one "count filename" line per
# file (wc is run once per file, so no grand-total line is printed).
for peak_set in \
    GATA3.with.motif_1 \
    GATA3.with.motif_2 \
    GATA3.with.motif_4 \
    GATA3.with.motif_5 \
    GATA3.with.motif_6 \
    GATA3_without_motifs_123456_78_161bp_mast \
    GATA3_without_motifs_quantile1 \
    indep.DHS.control.consensus
do
    wc -l closest.1st.plus.GAT.to.${peak_set}.bed
done
## 12470 closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
## 11475 closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
## 6505 closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
## 4167 closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
## 5363 closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
## 37308 closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## 7462 closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
## 57906 closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
67/12470=0.005372895
53/11475=0.004618736
90/6505=0.01383551
19/4167=0.004559635
50/5363=0.00932314
In every positive-control peak set, fewer than 1.5% of peaks have their closest GAT more than 200 bp from the peak summit (less than 1% in all sets except motif 4, at 1.4%).
In the negative control (DHS regions), 4.88% of regions have their closest GAT outside the 400 bp window (±200 bp) around the summit.
Among all peaks without motifs, 4.46% have their closest GAT outside the 400 bp window around the summit.
For the top 20% quantile of peaks without motifs (quantile1, the peaks with the higher peak intensity), this fraction is 1.6%.
# Stacked bar chart of, per peak set, how many peaks have their closest GAT
# within 200 bp of the summit (in_counts) versus further away (out_counts).
# Counts are the awk / wc -l results listed above.
peak_set_names <- c("GATA3.with.motif_1", "GATA3.with.motif_2", "GATA3.with.motif_4", "GATA3.with.motif_5", "GATA3.with.motif_6", "GATA3_without_motifs", "GATA3_without_motifs_quantile1", "indep.DHS.control.consensus")
far_counts <- c(67, 53, 90, 19, 50, 1664, 120, 2829)
total_counts <- c(12470, 11475, 6505, 4167, 5363, 37308, 7462, 57906)
# the factor levels keep the peak sets in the order listed above
df <- data.frame(
  peak_set = factor(peak_set_names, levels = peak_set_names),
  out_counts = far_counts,
  all_counts = total_counts
)
df$in_counts <- df$all_counts - df$out_counts
library(lattice)
# colours follow the formula order: in_counts = lightblue, out_counts = pink;
# the key lists them in the opposite order (pink first)
barchart(
  in_counts + out_counts ~ peak_set,
  data = df,
  stack = TRUE,
  horizontal = FALSE,
  col = c("lightblue", "pink"),
  ylab = "number of peaks",
  scales = list(x = list(rot = 45, cex = 0.6)),
  key = list(
    space = "right",
    rectangles = list(pch = c(15, 15), col = c("pink", "lightblue")),
    text = list(c("closest abs distance >200bp", "closest abs distance <200bp"), cex = 1)
  )
)
To see how often these deviations happen, we can quantify the distances of the 1st closest GAT to GATA3 peaks with motifs (positive controls) and see how the distances are distributed within each peak set:
plus-GAT
ls closest.1st.plus.GAT.*.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.plus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed
# Load the distance column (column 11) of every closest.1st.plus.GAT.to.*.bed
# file in the working directory and label each row with the peak set parsed
# from the file name.
closest_1st_files <- Sys.glob(file.path("./closest.1st.plus.GAT.to.*.bed"))
# Collect one table per file and bind once, instead of growing a data frame
# with rbind() inside the loop.
per_file_tables <- vector("list", length(closest_1st_files))
for (file_index in seq_along(closest_1st_files)) {
  closest_1st_dis <- closest_1st_files[file_index]
  print(closest_1st_dis)
  anchor_3mer <- "plus.GAT"
  # peak set = file name between "closest.1st.plus.GAT.to." and ".bed"
  peak_set <- strsplit(strsplit(basename(closest_1st_dis), 'closest.1st.plus.GAT.to.')[[1]][2], ".bed")[[1]][1]
  print(peak_set)
  temp <- data.frame(
    dis = as.integer(read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]),
    anchor_3mer = anchor_3mer,
    peak_set = peak_set,
    stringsAsFactors = FALSE
  )
  temp$abs.dis <- abs(temp$dis)
  per_file_tables[[file_index]] <- temp
}
if (length(per_file_tables) > 0L) {
  df.plot <- do.call(rbind, per_file_tables)
} else {
  # No matching files: keep the empty four-column template.
  df.plot <- data.frame(matrix(nrow = 0, ncol = 4))
  colnames(df.plot) <- c("dis", "anchor_3mer", "peak_set", "abs.dis")
}
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_1.bed"
## [1] "GATA3.with.motif_1"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_2.bed"
## [1] "GATA3.with.motif_2"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_4.bed"
## [1] "GATA3.with.motif_4"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_5.bed"
## [1] "GATA3.with.motif_5"
## [1] "./closest.1st.plus.GAT.to.GATA3.with.motif_6.bed"
## [1] "GATA3.with.motif_6"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GATA3_without_motifs_123456_78_161bp_mast"
## [1] "./closest.1st.plus.GAT.to.GATA3_without_motifs_quantile1.bed"
## [1] "GATA3_without_motifs_quantile1"
## [1] "./closest.1st.plus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "indep.DHS.control.consensus"
df.plot$peak_set = factor(df.plot$peak_set, levels = c("GATA3.with.motif_1", "GATA3.with.motif_2", "GATA3.with.motif_4", "GATA3.with.motif_5", "GATA3.with.motif_6", "GATA3_without_motifs_123456_78_161bp_mast", "GATA3_without_motifs_quantile1", "indep.DHS.control.consensus"))
summary(df.plot)
## dis anchor_3mer
## Min. : 0.0 Length:142656
## 1st Qu.: 8.0 Class :character
## Median : 24.0 Mode :character
## Mean : 585.7
## 3rd Qu.: 59.0
## Max. :734003.0
##
## peak_set abs.dis
## indep.DHS.control.consensus :57906 Min. : 0.0
## GATA3_without_motifs_123456_78_161bp_mast:37308 1st Qu.: 8.0
## GATA3.with.motif_1 :12470 Median : 24.0
## GATA3.with.motif_2 :11475 Mean : 585.7
## GATA3_without_motifs_quantile1 : 7462 3rd Qu.: 59.0
## GATA3.with.motif_4 : 6505 Max. :734003.0
## (Other) : 9530
str(df.plot)
## 'data.frame': 142656 obs. of 4 variables:
## $ dis : int 10 16 0 0 3 3 3 0 2 20 ...
## $ anchor_3mer: chr "plus.GAT" "plus.GAT" "plus.GAT" "plus.GAT" ...
## $ peak_set : Factor w/ 8 levels "GATA3.with.motif_1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ abs.dis : int 10 16 0 0 3 3 3 0 2 20 ...
library(lattice)
library(latticeExtra)
# Lattice theme used by the box plot below.
my.settings <- list(
  superpose.polygon = list(col = c("black", "grey"), border = "transparent"),
  strip.background = list(col = "grey80", cex = 0.6),
  strip.border = list(col = "black")
)
# Box plot of log10(absolute distance of the closest plus GAT to the summit)
# per peak set, drawn over the jittered raw points. Dashed reference lines
# mark 80 bp (red) and 20 bp (blue).
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)",
  ylim = c(-1, 7),
  scales = list(x = list(rot = 45, cex = 0.6)),
  par.settings = my.settings,
  do.out = FALSE,
  panel = function(...) {
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    #panel.violin(..., col="transparent", varwidth = FALSE)
    panel.bwplot(...)
  }
)
# Lattice theme for the plot below, assembled entry by entry.
my.settings <- list()
my.settings$superpose.polygon <- list(col = c("black", "grey"), border = "transparent")
my.settings$strip.background <- list(col = "grey80", cex = 0.6)
my.settings$strip.border <- list(col = "black")
# Violin outlines of log10(abs.dis) per peak set over jittered grey crosses,
# with dashed guide lines at the log10 of 200, 80 and 20.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot,
  panel = function(...) {
    #panel.bwplot(...)
    # Guide lines go down first so the data are drawn over them.
    panel.abline(h = 2.3, lty = 2, col = "grey40") # log10(200)~2.3
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    panel.stripplot(
      ...,
      jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey"
    )
    panel.violin(
      ...,
      col = "transparent", border = "black",
      do.out = FALSE, varwidth = FALSE
    )
  },
  par.settings = my.settings,
  do.out = FALSE,
  ylim = c(-1, 7),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)",
  scales = list(x = list(rot = 45, cex = 0.6))
)
# Pink violins of log10(abs.dis) for every peak set, drawn over the jittered
# individual observations; violin width varies per group (varwidth = TRUE).
bwplot(log10(abs.dis) ~ peak_set,
  data = df.plot,
  #do.out=FALSE,
  ylim = c(-1, 7),
  # box.ratio is named in the signature so it is taken out of `...` and
  # handed to panel.violin() explicitly.
  panel = function(..., box.ratio) {
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    # TRUE/FALSE instead of T/F: T and F are reassignable variables.
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(..., col = "pink", alpha = 0.4, box.ratio = box.ratio, do.out = FALSE, border = "black", varwidth = TRUE)
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)")
# Keep only the motif-containing GATA3 peak sets and the DHS control.
df.plot1 <- df.plot[df.plot$peak_set %in% c("GATA3.with.motif_1", "GATA3.with.motif_2", "GATA3.with.motif_4", "GATA3.with.motif_5", "GATA3.with.motif_6","indep.DHS.control.consensus"), ]
# Pink violins of log10(abs.dis) for the retained peak sets over jittered
# observations, with dashed guide lines at the log10 of 200, 80 and 20.
bwplot(log10(abs.dis) ~ peak_set,
  data = df.plot1,
  #do.out=FALSE,
  ylim = c(-1, 7),
  panel = function(..., box.ratio) {
    panel.abline(h = 2.3, lty = 2, col = "grey40") # log10(200)~2.3
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    # TRUE/FALSE instead of T/F: T and F are reassignable variables.
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    panel.violin(..., col = "pink", alpha = 0.4, box.ratio = box.ratio, do.out = FALSE, border = "black", varwidth = TRUE)
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest plus GAT to peak summit log10(abs.dis)")
We can observe that for GATA3 peaks with motifs 1, 4, and 5, most peaks have a closest GAT within the 161bp window around the peak summit (±80bp; log10(80)≈1.9, marked by the red dashed line). Additionally, for all positive GATA3 peak sets, the majority of distances fall within a ~40bp window (±20bp) around the summit (log10(20)≈1.3, marked by the blue dashed line).
In the negative control, the first GAT appears to be more distant from the peak summit.
plus-ATC (same as minus-GAT)
ls closest.1st.minus.GAT.*.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_1.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_2.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_4.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_5.bed
## closest.1st.minus.GAT.to.GATA3.with.motif_6.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed
## closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed
## closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed
# Extract the peak-set label from the path of a closestBed result file:
# drop the directory, then keep the text between `prefix` and ".bed".
# Both delimiters are matched literally (fixed = TRUE); as regular
# expressions their dots would match any character and could cut a label
# short (e.g. a name containing "mbed").
.peak_set_from_path <- function(path, prefix) {
  after_prefix <- strsplit(basename(path), prefix, fixed = TRUE)[[1]][2]
  strsplit(after_prefix, ".bed", fixed = TRUE)[[1]][1]
}

# Collect column 11 of every closest.1st.minus.GAT.to.*.bed file in the
# working directory into one long data frame with columns
#   dis (integer), anchor_3mer (character), peak_set (character),
#   abs.dis (integer, absolute value of dis).
anchor_3mer <- "minus.GAT"
closest_files <- Sys.glob(file.path("./closest.1st.minus.GAT.to.*.bed"))
per_file <- lapply(closest_files, function(closest_1st_dis) {
  print(closest_1st_dis)
  peak_set <- .peak_set_from_path(closest_1st_dis, "closest.1st.minus.GAT.to.")
  print(peak_set)
  dis <- as.integer(
    read.table(closest_1st_dis, header = FALSE, comment.char = "")[, 11]
  )
  data.frame(
    dis = dis,
    anchor_3mer = anchor_3mer,
    peak_set = peak_set,
    abs.dis = abs(dis),
    stringsAsFactors = FALSE
  )
})
# Start from an empty, correctly typed frame so the result still has the four
# expected columns when no file matches; bind everything in a single call
# instead of growing the frame inside the loop.
df.plot <- do.call(
  rbind,
  c(
    list(data.frame(
      dis = integer(0),
      anchor_3mer = character(0),
      peak_set = character(0),
      abs.dis = integer(0),
      stringsAsFactors = FALSE
    )),
    per_file
  )
)
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_1.bed"
## [1] "GATA3.with.motif_1"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_2.bed"
## [1] "GATA3.with.motif_2"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_4.bed"
## [1] "GATA3.with.motif_4"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_5.bed"
## [1] "GATA3.with.motif_5"
## [1] "./closest.1st.minus.GAT.to.GATA3.with.motif_6.bed"
## [1] "GATA3.with.motif_6"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
## [1] "GATA3_without_motifs_123456_78_161bp_mast"
## [1] "./closest.1st.minus.GAT.to.GATA3_without_motifs_quantile1.bed"
## [1] "GATA3_without_motifs_quantile1"
## [1] "./closest.1st.minus.GAT.to.indep.DHS.control.consensus.bed"
## [1] "indep.DHS.control.consensus"
# Turn peak_set into a factor with an explicit level order so the plots list
# the motif-containing sets first, then the two motif-less sets, and the
# DHS control last.
df.plot[["peak_set"]] <- factor(
  df.plot[["peak_set"]],
  levels = c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "GATA3_without_motifs_123456_78_161bp_mast",
    "GATA3_without_motifs_quantile1",
    "indep.DHS.control.consensus"
  )
)
summary(df.plot)
## dis anchor_3mer
## Min. : 0.0 Length:142656
## 1st Qu.: 9.0 Class :character
## Median : 24.0 Mode :character
## Mean : 606.2
## 3rd Qu.: 59.0
## Max. :890194.0
##
## peak_set abs.dis
## indep.DHS.control.consensus :57906 Min. : 0.0
## GATA3_without_motifs_123456_78_161bp_mast:37308 1st Qu.: 9.0
## GATA3.with.motif_1 :12470 Median : 24.0
## GATA3.with.motif_2 :11475 Mean : 606.2
## GATA3_without_motifs_quantile1 : 7462 3rd Qu.: 59.0
## GATA3.with.motif_4 : 6505 Max. :890194.0
## (Other) : 9530
str(df.plot)
## 'data.frame': 142656 obs. of 4 variables:
## $ dis : int 16 10 18 46 1 8 1 5 57 18 ...
## $ anchor_3mer: chr "minus.GAT" "minus.GAT" "minus.GAT" "minus.GAT" ...
## $ peak_set : Factor w/ 8 levels "GATA3.with.motif_1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ abs.dis : int 16 10 18 46 1 8 1 5 57 18 ...
# Light-blue violins of log10(abs.dis) for every peak set, drawn over the
# jittered individual observations; violin width varies per group.
# NOTE(review): abs.dis has a minimum of 0 and log10(0) is -Inf, so
# zero-distance peaks cannot be placed on this axis -- confirm that dropping
# them from the display is intended.
bwplot(log10(abs.dis) ~ peak_set,
  data = df.plot,
  #do.out=FALSE,
  ylim = c(-1, 7),
  panel = function(..., box.ratio) {
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    # TRUE instead of T: T is an ordinary variable that can be reassigned.
    panel.stripplot(..., jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey")
    # box.ratio is captured by the signature above, so it has to be forwarded
    # by name; this matches the other violin plots in this analysis.
    panel.violin(..., col = "lightblue", alpha = 0.4, box.ratio = box.ratio, border = "black", varwidth = TRUE)
  },
  scales = list(x = list(rot = 45, cex = 0.6)),
  xlab = "peak set",
  ylab = "closest minus GAT to peak summit log10(abs.dis)")
# Restrict the table to the motif-containing GATA3 peak sets plus the DHS
# control.
df.plot1 <- df.plot[
  df.plot[["peak_set"]] %in% c(
    "GATA3.with.motif_1",
    "GATA3.with.motif_2",
    "GATA3.with.motif_4",
    "GATA3.with.motif_5",
    "GATA3.with.motif_6",
    "indep.DHS.control.consensus"
  ),
]
# Fixed-width light-blue violins of log10(abs.dis) for the retained peak
# sets, over jittered grey crosses and two dashed guide lines.
bwplot(
  log10(abs.dis) ~ peak_set,
  data = df.plot1,
  panel = function(..., box.ratio) {
    #panel.abline(h= 2.3, lty =2, col="grey40") # log10(200)~2.3
    panel.abline(h = 1.9, lty = 2, col = "red") # log10(80)~1.9
    panel.abline(h = 1.3, lty = 2, col = "blue") # log10(20)~1.3
    panel.stripplot(
      ...,
      jitter.data = TRUE, pch = 4, cex = 0.4, col = "grey"
    )
    panel.violin(
      ...,
      col = "lightblue", alpha = 0.4, box.ratio = box.ratio,
      do.out = FALSE, border = "black", varwidth = FALSE
    )
  },
  #do.out=FALSE,
  ylim = c(-1, 7),
  xlab = "peak set",
  ylab = "closest minus GAT to peak summit log10(abs.dis)",
  scales = list(x = list(rot = 45, cex = 0.6))
)
Goal: given a prioritized 3mer list, generate the coordinates of the 3mer closest to each summit in a given set of peak summits.
240225_closestBed.R:
(cd /home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer)
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# closestBed function
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed (pasted into the shell
#                   command as-is).
#   bed1            "-a" intervals; written to disk and coordinate-sorted with
#                   `sort -k1,1 -k2,2n` before the call.
#   bed2            "-b" intervals; written to disk unchanged, so the caller
#                   must supply them already coordinate-sorted.
#   opt.string      extra closestBed options, e.g. '-d -t first'.
#
# Returns a data frame read from the closestBed output, with column names
# c(colnames(bed1), colnames(bed2), 'dis'); this naming assumes opt.string
# makes closestBed append exactly one extra column (as '-d' does).
#
# Stops with an error if the sort or closestBed command exits non-zero.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Write coordinates without scientific notation, but give the caller's
  # setting back when we leave.
  old_opts <- options(scipen = 99)
  on.exit(options(old_opts), add = TRUE)

  # Uniquely named scratch files in the working directory: same storage
  # location as before, but concurrent runs no longer overwrite each other.
  a_raw <- tempfile("a.file.", tmpdir = getwd(), fileext = ".bed")
  a_sorted <- tempfile("a.file.sorted.", tmpdir = getwd(), fileext = ".bed")
  b_sorted <- tempfile("b.file.sorted.", tmpdir = getwd(), fileext = ".bed")
  out_file <- tempfile("out.file.", tmpdir = getwd(), fileext = ".bed")
  # Remove the scratch files on every exit path, including errors.
  on.exit(unlink(c(a_raw, a_sorted, b_sorted, out_file)), add = TRUE)

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a_raw, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)

  # system() reports failure through its return value rather than an R
  # error, so the exit status has to be checked explicitly.
  run_checked <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (status != 0L) {
      stop("command failed with exit status ", status, ": ", cmd,
           call. = FALSE)
    }
    invisible(status)
  }

  # sort the a file by coordinates
  run_checked(paste("sort -k1,1 -k2,2n", shQuote(a_raw), ">", shQuote(a_sorted)))
  # call closestBed on bed1 and bed2
  run_checked(paste(functionstring, opt.string,
                    "-a", shQuote(a_sorted),
                    "-b", shQuote(b_sorted),
                    ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}
dir1 <- "/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/"
prioritized_triplets <- c("AAA", "TAA" ,"ATA" ,"TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
library(bigWig)
# peak summits: the same for every triplet, so read them once before the loop
# (center.bed() with zero-width windows; presumably this collapses each peak
# to its centre position -- confirm against the bigWig documentation)
GATA3_peak_summits <- center.bed(
  read.table(paste0(dir2, "without_motifs_123456_78_161bp_mast.bed"), header = FALSE),
  upstreamWindow = 0, downstreamWindow = 0
)
# For each prioritized triplet, find the first closest plus-strand instance to
# every peak summit and write the closestBed result to the working directory.
for (triplet in prioritized_triplets) {
  print(triplet)
  # 3mer genome coordinates: the glob must resolve to exactly one file,
  # otherwise read.table() fails with an unhelpful message
  plus_triplet_path <- Sys.glob(file.path(paste0(dir1, "hg38.3.3.3plus.*_", triplet, ".sorted.bed")))
  if (length(plus_triplet_path) != 1L) {
    stop("expected exactly one plus-strand BED file for triplet ", triplet,
         " in ", dir1, ", found ", length(plus_triplet_path), call. = FALSE)
  }
  plus.triplet.file <- read.table(file = plus_triplet_path, sep = "\t", header = FALSE)
  # closestBed--1st closest plus
  closest.1st.plus.triplet.to.peak <- bedTools.closest.mod(
    bed1 = GATA3_peak_summits[, 1:3],
    bed2 = plus.triplet.file,
    opt.string = '-d -t first'
  )
  write.table(
    closest.1st.plus.triplet.to.peak,
    file = paste0('closest.1st.plus.',triplet,'.to.GATA3_without_motifs_123456_78_161bp_mast.bed'),
    quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
  )
}
runR.sh
#!/bin/bash
#SBATCH --job-name=runR.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=128G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR.sh_%j.out
#SBATCH -e runR.sh_%j.err
# SLURM batch wrapper for 240225_closestBed.R.
# The directives above request 1 node, 1 task, 8 CPUs per task and 128G of
# memory on the "general" partition/QOS; stdout and stderr go to
# runR.sh_<jobid>.out / runR.sh_<jobid>.err and all job events are mailed.
# The R script is addressed by a relative path, so the job has to be
# submitted from the directory that contains it.
module load R/4.1.2
Rscript 240225_closestBed.R
bedtools subtract.
#!/bin/bash
#SBATCH --job-name=remove_1st_3mer.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 16
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=200G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o remove_1st_3mer.sh_%j.out
#SBATCH -e remove_1st_3mer.sh_%j.err
# For every prioritized triplet, subtract from the genome-wide plus-strand
# 3mer BED those instances that were already reported as the closest 3mer to
# a peak summit. Results are written to the current working directory.
input_dir1=/labs/Guertin/siyu/Sathyan_GATA3_ChIP_pool1_pool2/overrep_3mer/hg38_full_kmer3_rs1000/seqdump/
input_dir2=/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/
prioritized_triplets=("AAA" "TAA" "ATA" "TTA" "AAT" "TAT" "GAT" "ATT" "TTT" "ATC" "AGA" "TCT" "TAG" "CTA")
module load bedtools
for triplet in "${prioritized_triplets[@]}"
do
    echo "$triplet"
    closest_bed="${input_dir2}closest.1st.plus.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.bed"
    uniq_bed="closest.1st.plus.${triplet}.to.GATA3_without_motifs_123456_78_161bp_mast.uniq.sorted.bed"
    # plus
    # Keep columns 4-10 of the closestBed output (the matched 3mer record,
    # given a 3-column peak file on the -a side) as tab-separated BED.
    # Sort BEFORE uniq: uniq only collapses adjacent duplicate lines, and the
    # same 3mer can be the closest hit of peaks that are not adjacent in the file.
    awk 'BEGIN { OFS = "\t" } { print $4, $5, $6, $7, $8, $9, $10 }' "$closest_bed" \
        | sort -k1,1 -k2,2n \
        | uniq > "$uniq_bed"
    # -f 1.00 -s: remove a genome 3mer only when it is fully covered by a
    # same-strand entry of the closest-3mer list. The glob is left unquoted
    # on purpose so the shell expands it.
    bedtools subtract -a "${input_dir1}"hg38.3.3.3plus.*"${triplet}".sorted.bed -b "$uniq_bed" -f 1.00 -s > "hg38.3.3.3plus.${triplet}_without_1st_plus_${triplet}_to_GATA3_without_motifs_123456_78_161bp_mast.bed"
    rm "$uniq_bed"
done
240226_closestBed_peak_without_mot.R
#!/usr/bin/env Rscript
Args=commandArgs(TRUE)
# Run bedtools closestBed on two BED-like data frames and return its output.
#
# Arguments:
#   functionstring  command used to invoke closestBed (pasted into the shell
#                   command as-is).
#   bed1            "-a" intervals; written to disk and coordinate-sorted with
#                   `sort -k1,1 -k2,2n` before the call.
#   bed2            "-b" intervals; written to disk unchanged, so the caller
#                   must supply them already coordinate-sorted.
#   opt.string      extra closestBed options, e.g. '-d -t first'.
#
# Returns a data frame read from the closestBed output, with column names
# c(colnames(bed1), colnames(bed2), 'dis'); this naming assumes opt.string
# makes closestBed append exactly one extra column (as '-d' does).
#
# Stops with an error if the sort or closestBed command exits non-zero.
bedTools.closest.mod <- function(functionstring="/home/FCAM/ssun/packages/bedtools2/bin/closestBed",bed1,bed2,opt.string="") {
  # Write coordinates without scientific notation, but give the caller's
  # setting back when we leave.
  old_opts <- options(scipen = 99)
  on.exit(options(old_opts), add = TRUE)

  # Uniquely named scratch files in the working directory: same storage
  # location as before, but concurrent runs no longer overwrite each other.
  a_raw <- tempfile("a.file.", tmpdir = getwd(), fileext = ".bed")
  a_sorted <- tempfile("a.file.sorted.", tmpdir = getwd(), fileext = ".bed")
  b_sorted <- tempfile("b.file.sorted.", tmpdir = getwd(), fileext = ".bed")
  out_file <- tempfile("out.file.", tmpdir = getwd(), fileext = ".bed")
  # Remove the scratch files on every exit path, including errors.
  on.exit(unlink(c(a_raw, a_sorted, b_sorted, out_file)), add = TRUE)

  # write bed formatted data.frames to the scratch files
  write.table(bed1, file = a_raw, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)
  write.table(bed2, file = b_sorted, quote = FALSE, sep = "\t",
              col.names = FALSE, row.names = FALSE)

  # system() reports failure through its return value rather than an R
  # error, so the exit status has to be checked explicitly.
  run_checked <- function(cmd) {
    cat(cmd, "\n")
    status <- system(cmd)
    if (status != 0L) {
      stop("command failed with exit status ", status, ": ", cmd,
           call. = FALSE)
    }
    invisible(status)
  }

  # sort the a file by coordinates
  run_checked(paste("sort -k1,1 -k2,2n", shQuote(a_raw), ">", shQuote(a_sorted)))
  # call closestBed on bed1 and bed2
  run_checked(paste(functionstring, opt.string,
                    "-a", shQuote(a_sorted),
                    "-b", shQuote(b_sorted),
                    ">", shQuote(out_file), sep = " "))

  res <- read.table(out_file, header = FALSE, comment.char = "")
  colnames(res) <- c(colnames(bed1), colnames(bed2), "dis")
  return(res)
}
library(bigWig)
# List of prioritized triplets
prioritized_triplets <- c("AAA", "TAA", "ATA", "TTA", "AAT", "TAT", "GAT", "ATT", "TTT", "ATC", "AGA", "TCT", "TAG", "CTA")
# Every ordered pair (first triplet, second triplet), first triplet varying
# slowest, built without growing a list inside nested loops.
n_triplets <- length(prioritized_triplets)
first_3_bases <- rep(prioritized_triplets, each = n_triplets)
last_3_bases <- rep(prioritized_triplets, times = n_triplets)
# List of the corresponding 6mers (first triplet followed by second triplet)
all_6mers <- as.list(paste0(first_3_bases, last_3_bases))
# Data frame with first 3 bases and last 3 bases
df <- data.frame(First_3_bases = first_3_bases, Last_3_bases = last_3_bases)
dir1 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/"
dir2 <- "/home/FCAM/ssun/GATA3_ChIP_PRO_July2023/ChIP_final/GAT_3mer_analaysis/peak_161_without_motifs_12345678/closest_other_3mer/closest_2nd_other_3mer/"
# GATA3 peaks without motifs
# anchor position: closest plus pattern1 to each peak summit (columns 4-11 of
# the first-round closestBed output). Each anchor table is read once and
# reused for every pattern2.
anchor_beds <- lapply(
  setNames(prioritized_triplets, prioritized_triplets),
  function(pattern1) {
    fiveprime.bed(
      read.table(paste0(dir1, "closest.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78_161bp_mast.bed"), header = FALSE)[, 4:11],
      upstreamWindow = 0, downstreamWindow = 0
    )
  }
)
# pattern2 is the outer loop so that each genome-wide 3mer file is read once
# per triplet instead of once per (pattern1, pattern2) pair. Every pair still
# produces its own output file; only the order in which they are written
# changes.
for (pattern2 in prioritized_triplets) {
  # query 3mer coordinates on genome (without the overlapped closest 3mer coordinates)
  print(pattern2)
  plus.3mer.file <- fiveprime.bed(
    read.table(file = paste0(dir2, "hg38.3.3.3plus.", pattern2, "_without_1st_plus_", pattern2, "_to_GATA3_without_motifs_123456_78_161bp_mast.bed"), sep = "\t", header = FALSE),
    upstreamWindow = 0, downstreamWindow = 0
  )
  for (pattern1 in prioritized_triplets) {
    print(pattern1)
    # 2nd closest plus 3mer to closest plus 3mer
    closest.2nd.plus.3mer.to.1st.plus.3mer <- bedTools.closest.mod(
      bed1 = anchor_beds[[pattern1]][, 1:3],
      bed2 = plus.3mer.file,
      opt.string = '-d -t first'
    )
    write.table(
      closest.2nd.plus.3mer.to.1st.plus.3mer,
      file = paste0("closest.2nd.plus.", pattern2, ".to.1st.plus.", pattern1, ".to.GATA3_without_motifs_123456_78.bed"),
      quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
    )
  }
}
runR_peak_without_mot.sh
#!/bin/bash
#SBATCH --job-name=runR_peak_without_mot.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 8
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=128G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o runR_peak_without_mot.sh_%j.out
#SBATCH -e runR_peak_without_mot.sh_%j.err
# SLURM batch wrapper: runs 240226_closestBed_peak_without_mot.R from inside
# the peak_without_mot/ subdirectory, so the files the R script writes to its
# working directory land there.
hostname
# -p keeps a resubmission from failing when the directory already exists.
mkdir -p peak_without_mot
# Stop here if the directory cannot be entered; otherwise the relative
# ../ script path below would point at the wrong place.
cd peak_without_mot || exit 1
module load R/4.1.2
Rscript ../240226_closestBed_peak_without_mot.R